/usr/share/pyshared/cogent/app/raxml.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 | #!/usr/bin/env python
"""Application controller for RAxML (v7.0.3).
WARNING: Because of the use of the -x option, this version is no longer
compatible with RAxML version VI.
"""
from cogent.app.parameters import FlagParameter, ValuedParameter, FilePath
from cogent.app.util import CommandLineApplication, ResultPath, get_tmp_filename
from cogent.core.tree import PhyloNode
from cogent.core.alignment import Alignment
from cogent.core.moltype import DNA, RNA, PROTEIN
from random import choice, randint
from os import walk
from os.path import isabs
from cogent.parse.tree import DndParser
__author__ = "Micah Hamady"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Micah Hamady", "Catherine Lozupone", "Rob Knight", \
"Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Micah Hamady"
__email__ = "hamady@colorado.edu"
__status__ = "Prototype"
class Raxml(CommandLineApplication):
"""RAxML application controller"""
_options ={
# Specify a column weight file name to assign individual wieghts to
# each column of the alignment. Those weights must be integers
# separated by any number and type of whitespaces whithin a separate
# file, see file "example_weights" for an example.
'-a':ValuedParameter('-',Name='a',Delimiter=' '),
# Specify an integer number (random seed) for bootstrapping
'-b':ValuedParameter('-',Name='b',Delimiter=' '),
# Specify number of distinct rate catgories for raxml when
# ModelOfEvolution is set to GTRCAT or HKY85CAT.
# Individual per-site rates are categorized into numberOfCategories
# rate categories to accelerate computations. (Default = 50)
'-c':ValuedParameter('-',Name='c',Delimiter=' ', Value=50),
# This option allows you to start the RAxML search with a complete
# random starting tree instead of the default Maximum Parsimony
# Starting tree. On smaller datasets (around 100-200 taxa) it has
# been observed that this might sometimes yield topologies of distinct
# local likelihood maxima which better correspond to empirical
# expectations.
'-d':FlagParameter('-',Name='d'),
# This allows you to specify up to which likelihood difference.
# Default is 0.1 log likelihood units, author recommends 1 or 2 to
# rapidly evaluate different trees.
'-e':ValuedParameter('-',Name='e',Delimiter=' ', Value=0.1),
# select search algorithm:
# d for normal hill-climbing search (Default)
# when -f option is omitted this algorithm will be used
# o old (slower) algorithm from v. 2.1.3
# c (check) just tests whether it can read the alignment
# e (evaluate) to optimize model+branch lengths for given input tree
# b (bipartition) draws bipartitions
# s (split) splits into individual genes, provided with model file
'-f':ValuedParameter('-',Name='f',Delimiter=' ', Value="d"),
# select grouping file name: allows incomplete multifurcating constraint
# tree in newick format -- resolves multifurcations randomly, adds
# other taxa using parsimony insertion
'-g':ValuedParameter('-', Name='g',Delimiter=' '),
# prints help and exits
'-h':FlagParameter('-', Name='h'),
# allows initial rearrangement to be constrained, e.g. 10 means
# insertion will not be more than 10 nodes away from original.
# default is to pick a "good" setting.
'-i':ValuedParameter('-', Name='i', Delimiter=' '),
# writes checkpoints (off by default)
'-j':FlagParameter('-', Name='j'),
#specifies that RAxML will optimize model parameters (for GTRMIX and
# GTRGAMMA) as well as calculating likelihoods for bootstrapped trees.
'-k':FlagParameter('-', Name='k'),
# Model of Nucleotide Substitution:
# -m GTRGAMMA: GTR + Optimization of substitution rates + Gamma
# -m GTRCAT: GTR + Optimization of substitution rates + Optimization
# of site-specific evolutionary rates which are categorized into
# numberOfCategories distinct rate categories for greater
# computational efficiency
# -m GTRMIX: Searches for GTRCAT, then switches to GTRGAMMA
# Amino Acid Models
# matrixName (see below): DAYHOFF, DCMUT, JTT, MTREV, WAG, RTREV,
# CPREV, VT, BLOSUM62, MTMAM, GTR.
# F means use empirical nucleotide frequencies (append to string)
# -m PROTCATmatrixName[F]: uses site-specific rate categories
# -m PROTGAMMAmatrixName[F]: uses Gamma
# -m PROTMIXmatrixName[F]: switches between gamma and cat models
# e.g. -m PROTCATBLOSUM62F would use protcat with BLOSUM62 and
# empirical frequencies
'-m':ValuedParameter('-',Name='m',Delimiter=' '),
# Specifies the name of the output file.
'-n':ValuedParameter('-',Name='n',Delimiter=' '),
# Specifies the name of the outgroup (or outgroups: comma-delimited,
# no spaces, should be monophyletic).
'-o':ValuedParameter('-',Name='o',Delimiter=' '),
# Specified MultipleModel file name, in format:
# gene1 = 1-500
# gene2 = 501-1000
# (note: ranges can also be discontiguous, e.g. 1-100, 200-300,
# or can specify codon ranges as e.g. 1-100/3, 2-100/3, 3-100/3))
'-q':ValuedParameter('-', Name='q', Delimiter=' '),
# Name of the working directory where RAxML-V will write its output
# files.
'-w':ValuedParameter('-',Name='w',Delimiter=' '),
# Constraint file name: allows a bifurcating Newick tree to be passed
# in as a constraint file, other taxa will be added by parsimony.
'-r':ValuedParameter('-',Name='r',Delimiter=' '),
# Specify a random number seed for the parsimony inferences. This
# allows you to reproduce your results and will help me debug the
# program. This option HAS NO EFFECT in the parallel MPI version
'-p':ValuedParameter('-',Name='p',Delimiter=' '),
# specify the name of the alignment data file, in relaxed PHYLIP
# format.
'-s':ValuedParameter('-',Name='s',Delimiter=' '),
# Specify a user starting tree file name in Newick format
'-t':ValuedParameter('-',Name='t',Delimiter=' '),
# Print the version
'-v':FlagParameter('-',Name='v'),
# Compute only randomized starting parsimony tree with RAxML, do not
# optimize an ML analysis of the tree
'-y':FlagParameter('-', Name='y'),
# Multiple tree file, for use with -f b (to draw bipartitions onto the
# common tree specified with -t)
'-z':ValuedParameter('-', Name='z', Delimiter=' '),
# Specifies number of runs on distinct starting trees.
'-#':ValuedParameter('-', Name='#', Delimiter=' '),
#Specify an integer number (random seed) to turn on rapid bootstrapping
'-x':ValuedParameter('-', Name='x', Delimiter=' ')
}
_parameters = {}
_parameters.update(_options)
_command = "raxmlHPC"
_out_format = "RAxML_%s.%s"
def _format_output(self, outfile_name, out_type):
""" Prepend proper output prefix to output filename """
outfile_name = self._absolute(outfile_name)
outparts = outfile_name.split("/")
outparts[-1] = self._out_format % (out_type, outparts[-1] )
return '/'.join(outparts)
def _input_as_seqs(self,data):
lines = []
for i,s in enumerate(data):
#will number the sequences 1,2,3,etc.
lines.append(''.join(['>',str(i+1)]))
lines.append(s)
return self._input_as_lines(lines)
def _input_as_lines(self,data):
if data:
self.Parameters['-s']\
.on(super(Raxml,self)._input_as_lines(data))
return ''
def _input_as_string(self,data):
"""Makes data the value of a specific parameter
This method returns the empty string. The parameter will be printed
automatically once set.
"""
if data:
self.Parameters['-in'].on(str(data))
return ''
def _input_as_multiline_string(self, data):
if data:
self.Parameters['-s']\
.on(super(Raxml,self)._input_as_multiline_string(data))
return ''
def _absolute(self,path):
path = FilePath(path)
if isabs(path):
return path
elif self.Parameters['-w'].isOn():
return self.Parameters['-w'].Value + path
else:
return self.WorkingDir + path
def _log_out_filename(self):
if self.Parameters['-n'].isOn():
return self._format_output(str(self.Parameters['-n'].Value), "log")
else:
raise ValueError, "No output file specified."
def _info_out_filename(self):
if self.Parameters['-n'].isOn():
return self._format_output(str(self.Parameters['-n'].Value), "info")
else:
raise ValueError, "No output file specified."
def _parsimony_tree_out_filename(self):
if self.Parameters['-n'].isOn():
return self._format_output(str(self.Parameters['-n'].Value), "parsimonyTree")
else:
raise ValueError, "No output file specified."
def _result_tree_out_filename(self):
if self.Parameters['-n'].isOn():
return self._format_output(str(self.Parameters['-n'].Value), "result")
else:
raise ValueError, "No output file specified."
def _result_bootstrap_out_filename(self):
if self.Parameters['-n'].isOn():
return self._format_output(str(self.Parameters['-n'].Value), \
"bootstrap")
else:
raise ValueError, "No output file specified"
def _checkpoint_out_filenames(self):
"""
RAxML generates a crapload of checkpoint files so need to
walk directory to collect names of all of them.
"""
out_filenames = []
if self.Parameters['-n'].isOn():
out_name = str(self.Parameters['-n'].Value)
walk_root = self.WorkingDir
if self.Parameters['-w'].isOn():
walk_root = str(self.Parameters['-w'].Value)
for tup in walk(walk_root):
dpath, dnames, dfiles = tup
if dpath == walk_root:
for gen_file in dfiles:
if out_name in gen_file and "checkpoint" in gen_file:
out_filenames.append(walk_root + gen_file)
break
else:
raise ValueError, "No output file specified."
return out_filenames
def _get_result_paths(self,data):
result = {}
result['Info'] = ResultPath(Path=self._info_out_filename(),
IsWritten=True)
if self.Parameters['-k'].isOn():
result['Bootstrap'] = ResultPath(
Path=self._result_bootstrap_out_filename(),
IsWritten=True)
else:
result['Log'] = ResultPath(Path=self._log_out_filename(),
IsWritten=True)
result['ParsimonyTree'] = ResultPath(
Path=self._parsimony_tree_out_filename(),
IsWritten=True)
result['Result'] = ResultPath(
Path=self._result_tree_out_filename(),
IsWritten=True)
for checkpoint_file in self._checkpoint_out_filenames():
checkpoint_num = checkpoint_file.split(".")[-1]
try:
checkpoint_num = int(checkpoint_num)
except Exception, e:
raise ValueError, "%s does not appear to be a valid checkpoint file"
result['Checkpoint%d' % checkpoint_num] = ResultPath(
Path=checkpoint_file,
IsWritten=True)
return result
#SOME FUNCTIONS TO EXECUTE THE MOST COMMON TASKS
def raxml_alignment(align_obj,
raxml_model="GTRCAT",
params={},
SuppressStderr=True,
SuppressStdout=True):
"""Run raxml on alignment object
align_obj: Alignment object
params: you can set any params except -w and -n
returns: tuple (phylonode,
parsimonyphylonode,
log likelihood,
total exec time)
"""
# generate temp filename for output
params["-w"] = "/tmp/"
params["-n"] = get_tmp_filename().split("/")[-1]
params["-m"] = raxml_model
ih = '_input_as_multiline_string'
seqs, align_map = align_obj.toPhylip()
#print params["-n"]
# set up command
raxml_app = Raxml(
params=params,
InputHandler=ih,
WorkingDir=None,
SuppressStderr=SuppressStderr,
SuppressStdout=SuppressStdout)
# run raxml
ra = raxml_app(seqs)
# generate tree
tree_node = DndParser(ra["Result"])
# generate parsimony tree
parsimony_tree_node = DndParser(ra["ParsimonyTree"])
# extract log likelihood from log file
log_file = ra["Log"]
total_exec_time = exec_time = log_likelihood = 0.0
for line in log_file:
exec_time, log_likelihood = map(float, line.split())
total_exec_time += exec_time
# remove output files
ra.cleanUp()
return tree_node, parsimony_tree_node, log_likelihood, total_exec_time
def build_tree_from_alignment(aln, moltype, best_tree=False, params={}):
"""Returns a tree from Alignment object aln.
aln: an xxx.Alignment object, or data that can be used to build one.
moltype: cogent.core.moltype.MolType object
best_tree: best_tree suppport is currently not implemented
params: dict of parameters to pass in to the RAxML app controller.
The result will be an xxx.Alignment object, or None if tree fails.
"""
if best_tree:
raise NotImplementedError
if '-m' not in params:
if moltype == DNA or moltype == RNA:
params["-m"] = 'GTRMIX'
elif moltype == PROTEIN:
params["-m"] = 'PROTMIXGTR'
else:
raise ValueError, "Moltype must be either DNA, RNA, or PROTEIN"
if not hasattr(aln, 'toPhylip'):
aln = Alignment(aln)
seqs, align_map = aln.toPhylip()
# generate temp filename for output
params["-w"] = "/tmp/"
params["-n"] = get_tmp_filename().split("/")[-1]
params["-k"] = True
params["-x"] = randint(1,100000)
ih = '_input_as_multiline_string'
raxml_app = Raxml(params=params,
InputHandler=ih,
WorkingDir=None,
SuppressStderr=True,
SuppressStdout=True)
raxml_result = raxml_app(seqs)
tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)
for node in tree.tips():
node.Name = align_map[node.Name]
raxml_result.cleanUp()
return tree
|