# /usr/bin/macs2

#! /usr/bin/python
# Time-stamp: <2011-11-02 15:03:03 Tao Liu>

"""Description: MACS 2 main executable

Copyright (c) 2008,2009 Yong Zhang, Tao Liu <taoliu@jimmy.harvard.edu>
Copyright (c) 2010,2011 Tao Liu <taoliu@jimmy.harvard.edu>

This code is free software; you can redistribute it and/or modify it
under the terms of the Artistic License (see the file COPYING included
with the distribution).

@status: release candidate
@version: $Id$
@author:  Yong Zhang, Tao Liu
@contact: taoliu@jimmy.harvard.edu
"""

# ------------------------------------
# python modules
# ------------------------------------

import os
import sys
import logging
from subprocess import Popen,PIPE
from optparse import OptionParser
import gzip

# ------------------------------------
# own python modules
# ------------------------------------
from MACS2.OptValidator import opt_validate
from MACS2.OutputWriter import *
from MACS2.cProb import binomial_cdf_inv
from MACS2.PeakModel import PeakModel,NotEnoughPairsException
from MACS2.cPeakDetect import PeakDetect
from MACS2.Constants import *
# ------------------------------------
# Main function
# ------------------------------------
def main():
    """The Main function/pipeline for MACS.

    The pipeline runs in four numbered stages that match the log output:

    1. read treatment (and optional control) tags and, unless
       --keep-dup is 'all', filter duplicate tags;
    2. build the peak model, or fall back to --shiftsize when
       --nomodel is set (or when --auto-bimodal allows the fallback);
    3. balance treatment and control, then call peaks;
    4. write the xls, bed, narrowPeak, optional broad peak and summit
       files.

    Exits with a non-zero status when treatment and control share no
    chromosome names, or when the model cannot be built and
    --auto-bimodal is not set.

    """
    # Parse options...
    options = opt_validate(prepare_optparser())
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    info("\n"+options.argtxt)
    
    #1 Read tag files
    info("#1 read tag files...")
    (treat, control) = load_tag_files_options (options)
    # check common chromosome names
    if control:
        tchrnames = set(treat.get_chr_names())
        cchrnames = set(control.get_chr_names())
        commonnames = tchrnames.intersection(cchrnames)
        if len(commonnames)==0:
            error("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
            error("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
            error("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
            # this is a fatal error, so report failure to the caller
            sys.exit(1)
    
    info("#1 tag size = %d" % options.tsize)
    tagsinfo  = "# tag size is determined as %d bps\n" % (options.tsize)

    t0 = treat.total
    # t1 is the tag count used for balancing in step #3; without
    # duplicate filtering (--keep-dup all) it is simply the raw count.
    t1 = t0
    tagsinfo += "# total tags in treatment: %d\n" % (t0)
    info("#1  total tags in treatment: %d" % (t0))
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate tags in single position based on binomal distribution...")
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1  max_dup_tags based on binomal = %d" % (treatment_max_dup_tags))
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum tags...")
            treatment_max_dup_tags = int(options.keepduplicates)
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags))

        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1  tags after filtering in treatment: %d" % (t1))
        tagsinfo += "# tags after filtering in treatment: %d\n" % (t1)
        tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        treatment_rate = _redundant_rate(t0, t1)
        info("#1  Redundant rate of treatment: %.2f" % (treatment_rate))
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (treatment_rate)

    if control:
        c0 = control.total
        # same reasoning as t1: fall back to the raw count when no
        # duplicate filtering is requested.
        c1 = c0
        tagsinfo += "# total tags in control: %d\n" % (c0)
        info("#1  total tags in control: %d" % (c0))
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1  for control, calculate max duplicate tags in single position based on binomal distribution...")
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1  max_dup_tags based on binomal = %d" % (control_max_dup_tags))
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum tags...")
                control_max_dup_tags = int(options.keepduplicates)
                # report the control limit here, not the treatment one
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (control_max_dup_tags))
            
            control.filter_dup(control_max_dup_tags)
            c1 = control.total
            info("#1  tags after filtering in control: %d" % (c1))
            tagsinfo += "# tags after filtering in control: %d\n" % (c1)
            tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (control_max_dup_tags)
            
            control_rate = _redundant_rate(c0, c1)
            info("#1  Redundant rate of control: %.2f" % (control_rate))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (control_rate)
        
            
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    if options.nomodel:
        info("#2 Skipped...")
        options.d=options.shiftsize*2
        info("#2 Use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))
        options.scanwindow=2*options.d  # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2  Summary Model:")
            debug("#2   min_tags: %d" % (peakmodel.min_tags))
            debug("#2   d: %d" % (peakmodel.d))
            debug("#2   scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.onauto and options.d <= 2*options.tsize:
                # a predicted d this small is distrusted; fall back to
                # the user supplied shift size
                options.d=options.shiftsize*2
                options.scanwindow=2*options.d 
                warn("#2 Since the d calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem. MACS will use %d as shiftsize, %d as fragment length. NOTE: if the d calculated is still acceptable, please do not use --auto-bimodal option!" % (options.shiftsize,options.d))
                
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.shiftsize*2
            options.scanwindow=2*options.d 
            warn("#2 Since --auto-bimodal is set, MACS will use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))


    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment tags...")
                warn("#3 Your results may not be reproducible due to the random sampling!")
                treat.sample_num(c1)
                info("#3 %d tags from treatment are kept" % treat.total)
            elif c1 > t1: 
                info("#3 MACS is random sampling control tags...")
                warn("#3 Your results may not be reproducible due to the random sampling!")                
                control.sample_num(t1)
                info("#3 %d tags from control are kept" % control.total)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.tolarge:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()
    #diag_result = peakdetect.diag_result()
    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    # 'with' guarantees the handle is closed even if a write fails
    with open(options.peakxls,"w") as ofhd_xls:
        ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
        ofhd_xls.write(options.argtxt+"\n")

        ofhd_xls.write(tagsinfo)

        ofhd_xls.write("# d = %d\n" % (options.d))
        if options.nolambda:
            ofhd_xls.write("# local lambda is disabled!\n")
        ofhd_xls.write(peakdetect.toxls())
    #4.2 peaks in BED
    # NOTE(review): score_column stays unassigned if neither value is
    # truthy; presumably opt_validate always sets one of them -- confirm.
    if options.log_pvalue:
        score_column = "pscore"
    elif options.log_qvalue:
        score_column = "qscore"
    info("#4 Write peak bed file... %s" % (options.peakbed))
    with open(options.peakbed,"w") as ofhd_bed:
        peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="MACS_peak_", score_column=score_column)
    #4.2 peaks in narrowPeak
    info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
    with open(options.peakNarrowPeak,"w") as ofhd_bed:
        peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="MACS_peak_", score_column=score_column)
    #4.2 broad peaks in bed12
    if options.broad:
        info("#4 Write broad peak in bed12 format file... %s" % (options.peakBroadPeak))
        with open(options.peakBroadPeak,"w") as ofhd_bed:
            peakdetect.broadpeaks.write_to_gappedPeak (ofhd_bed, name_prefix="MACS_peak_", name=options.name, description=options.name)
    #4.2-2 summits in BED
    info("#4 Write summits bed file... %s" % (options.summitbed))
    with open(options.summitbed,"w") as ofhd_summits:
        peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="MACS_summit_", score_column=score_column)

def _redundant_rate ( before, after ):
    """Return the fraction of tags removed by duplicate filtering.

    before is the tag count prior to filtering and after the count
    that remains. Returns 0.0 when before is zero so that an empty
    sample does not raise ZeroDivisionError.

    """
    if not before:
        return 0.0
    return float(before-after)/before

def prepare_optparser ():
    """Prepare optparser object. New options will be added in this
    function first.

    Returns the configured OptionParser; no command line is parsed
    here. The built-in help option is disabled and re-added by hand so
    that -h/--help carries this script's own help text.
    
    """
    usage = """usage: %prog <-t tfile> [-n name] [-g genomesize] [options]

Example: %prog -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01

or example for broad peak calling: %prog -t ChIP.bam -c Control.bam --broad -g hs

"""
    description = "%prog -- Model-based Analysis for ChIP-Sequencing"

    optparser = OptionParser(version="%prog "+MACS_VERSION,description=description,usage=usage,add_help_option=False)
    optparser.add_option("-h","--help",action="help",help="show this help message and exit.")
    optparser.add_option("-t","--treatment",dest="tfile",type="string",
                         help="ChIP-seq treatment file. REQUIRED.")
    optparser.add_option("-c","--control",dest="cfile",type="string",
                         help="Control file.")
    optparser.add_option("-n","--name",dest="name",type="string",
                         help="Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
                         default="NA")
    optparser.add_option("-f","--format",dest="format",type="string",
                         help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let MACS decide which format the file is. Please check the definition in 00README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
                         default="AUTO")
    optparser.add_option("-g","--gsize",dest="gsize",type="string",default="hs",
                         help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs")
    optparser.add_option("-s","--tsize",dest="tsize",type="int",default=None,
                         help="Tag size. This will override the auto detected tag size. DEFAULT: Not set")
    optparser.add_option("--bw",dest="bw",type="int",default=300,
                         help="Band width. This value is only used while building the shifting model. DEFAULT: 300")
    optparser.add_option("-q","--qvalue",dest="qvalue",type="float",default=0.01,
                         help="Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.01 ")
    # the examples must name -p, the option this help text belongs to
    optparser.add_option("-p","--pvalue",dest="pvalue",type="float",
                         help="Pvalue cutoff for peak detection. When set (e.g. -p 0.05 or -p 1e-5), qvalue cutoff will be ignored. Default is not set.")
    optparser.add_option("-m","--mfold",dest="mfold",type="string",default="10,30",
                         help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. The regions must be lower than upper limit, and higher than the lower limit. DEFAULT:10,30")
    optparser.add_option("--nolambda",dest="nolambda",action="store_true",
                         help="If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to potential chromatin structure. ",
                         default=False)
    optparser.add_option("--slocal",dest="smalllocal",type="int",default=1000,
                         help="The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 1000 ")
    optparser.add_option("--llocal",dest="largelocal",type="int",default=10000,
                         help="The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation. The final local bias should be the maximum of the lambda value from d, slocal, and llocal size windows. DEFAULT: 10000.")
    optparser.add_option("--auto-bimodal",dest="onauto",action="store_true",
                         help="Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the '--shiftsize' parameter to shift and extend each tags. Not to use this automate fixation is a default behavior now. DEFAULT: False",
                         default=False)
    optparser.add_option("--nomodel",dest="nomodel",action="store_true",
                         help="Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set shiftsize to change it. DEFAULT: False",
                         default=False)
    optparser.add_option("--shiftsize",dest="shiftsize",type="int",default=100,
                         help="The arbitrary shift size in bp. When nomodel is true, MACS will use this value as 1/2 of fragment size. DEFAULT: 100 ")
    optparser.add_option("--keep-dup",dest="keepduplicates",type="string",default="auto",
                         help="It controls the MACS behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The default 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomial distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Default: auto")
    optparser.add_option("--to-large",dest="tolarge",action="store_true",default=False,
                         help="When set, scale the small sample up to the bigger sample. By default, the bigger dataset will be scaled down towards the smaller dataset, which will lead to smaller p/qvalues and more specific results. Keep in mind that scaling down will bring down background noise more. DEFAULT: False")
    optparser.add_option("--down-sample",dest="downsample",action="store_true",default=False,
                         help="When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. DEFAULT: False")
    optparser.add_option("--shift-control",dest="shiftcontrol",action="store_true",default=False,
                         help="When set, control tags will be shifted just as ChIP tags according to their strand before the extension of d, slocal and llocal. By default, control tags are extended centered at their current positions regardless of strand. You may consider to turn this option on while comparing two ChIP datasets of different condition but the same factor. DEFAULT: False")
    optparser.add_option("--half-ext",dest="halfext",action="store_true",default=False,
                         help="When set, MACS extends 1/2 d size for each fragment centered at its middle point. DEFAULT: False")
    optparser.add_option("-B","--bdg",dest="store_bdg",action="store_true",
                          help="Whether or not to save extended fragment pileup, local lambda and score tracks at every bp into a bedGraph file. DEFAULT: False",
                          default=False)
    # the linking cutoff is exposed as --broad-cutoff (defined just below)
    optparser.add_option("--broad",dest="broad",action="store_true",
                         help="If set, MACS will try to call broad peaks by linking nearby highly enriched regions. The linking region is controlled by another cutoff through --broad-cutoff. The maximum linking region length is 4 times of d from MACS. DEFAULT: False",default=False)
    optparser.add_option("--broad-cutoff",dest="broadcutoff",type="float",default=0.1,
                         help="Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. DEFAULT: 0.1 ")
    optparser.add_option("--verbose",dest="verbose",type="int",default=2,
                         help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
    return optparser
   
def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
    """Return the maximum number of duplicate tags allowed at a single
    position.

    The value is the inverse binomial CDF, as computed by
    binomial_cdf_inv, evaluated at cumulative probability 1-p with
    tags_number trials and a per-trial probability of 1/genome_size.

    """
    cdf_cutoff = 1-p
    per_position_prob = 1.0/genome_size
    return binomial_cdf_inv(cdf_cutoff,tags_number,per_position_prob)

def load_tag_files_options ( options ):
    """Load the treatment track and, when options.cfile is set, the
    control track.

    Each input is opened with open2 and handed to options.parser; the
    resulting tracks are sorted before being returned. When
    options.tsize is not already set, it is filled in from the
    treatment parser.

    Returns a (treat, control) tuple; control is None when no control
    file was given.

    """
    log = options.info
    log("#1 read treatment tags...")
    treat_parser = options.parser(open2(options.tfile))

    # only detect the tag size when the user did not supply --tsize
    if not options.tsize:
        options.tsize = treat_parser.tsize()

    treat = treat_parser.build_fwtrack()
    treat.sort()

    control = None
    if options.cfile:
        log("#1.2 read input tags...")
        control = options.parser(open2(options.cfile)).build_fwtrack()
        control.sort()

    log("#1 tag size is determined as %d bps" % options.tsize)
    return (treat, control)

def open2(path, mode='r', bufsize=-1):
    """Open path, reading through gzip when the file is gzip compressed.

    The file is first opened with gzip and a few bytes are read as a
    probe. If the probe raises IOError the file is treated as
    uncompressed and reopened with the builtin open, passing mode and
    bufsize through. Otherwise the gzip handle is rewound to the start
    and returned.

    """
    handle = gzip.open(path, mode)
    try:
        handle.read(10)
    except IOError:
        # probe failed: fall back to a regular file handle
        handle.close()
        return open(path, mode, bufsize)
    # probe succeeded: undo the bytes it consumed
    handle.seek(0)
    return handle



if __name__ == '__main__':
    # Script entry point: run the pipeline and turn Ctrl-C into a short
    # message on stderr followed by exit status 0.
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt me! ;-) Bye!\n")
        raise SystemExit(0)
# macs 2.0.9.1-1 / usr / bin / macs2