/usr/bin/bp_blast2tree is in bioperl 1.7.1-2.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | #!/usr/bin/perl
# Author: Jason Stajich <jason@bioperl.org>
# Purpose: Blast Report -> MSA -> Tree
# This needs lots more error checking, cmdline input of parameters
# and support for other treebuilding -- only Phylip Neighbor for
# protein data is supported
# Also proper pulling in of the correct sequence data from the
# alignment - multiple hits on different parts of a protein aren't
# going to work properly right now. So this is mostly and example
# starting point which needs a lot more work to be made robust.
use strict;
use warnings;
use Bio::AlignIO;
use Bio::Tools::Run::Alignment::Clustalw;
use Bio::Tools::Run::Phylo::Phylip::ProtDist;
use Bio::Tools::Run::Phylo::Phylip::Neighbor;
use Bio::Tools::Run::Phylo::Molphy::ProtML;
use Bio::Tools::Run::Phylo::Phylip::ProtPars;
use Bio::SearchIO;
use Bio::SimpleAlign;
use Bio::PrimarySeq;
use Bio::TreeIO;
use Getopt::Long;
my $IDLENGTH = 12;
# we could in fact pull the tree out of the guide tree calculated
# by Clustalw in the alignment, but I believe that is UPGMA
# which would *NOT* be a good thing to be giving people.
my $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new
('ktuple' => 2, "quiet" => 1,
'matrix' => 'BLOSUM');
my ($report,$format,$tree_method,$cutoff,$keepall);
$format = 'blast';
$tree_method = 'neighbor';
$cutoff = '0.01';
GetOptions(
'h|help' => sub { exec('perldoc', $0);
exit(0); },
'i|input:s' => \$report,
'f|format:s' => \$format,
'm|method:s' => \$tree_method,
'e|evalue:s' => \$cutoff,
'k|keepall:s' => \$keepall, # keep all HSPs not just best
);
unless( $format =~ /blast|fasta|hmmer/i ) {
die("Must request a valid search report format (fasta,blast,hmmer)");
}
unless ( $tree_method =~ /nj|neighbor/i || $tree_method =~ /protml|ml/i ) {
die("Must request a valid tree building method (neighbor,protml)");
}
my (@alns,@seqs);
my $in = new Bio::SearchIO(-file => $report,
-format => $format);
while( my $r = $in->next_result ) {
# Let's build the simplest system first
die("Cannot process report that does not contain protein sequence")
unless ($r->algorithm =~ /HMMER|BLASTP|FASTP/i );
my @seqs;
while( my $hit = $r->next_hit ) {
next if $hit->significance > $cutoff;
while( my $hsp = $hit->next_hsp ) {
next if $hsp->evalue > $cutoff;
my $seq = $hsp->get_aln->get_seq_by_pos(2)->seq();
push @seqs, new Bio::PrimarySeq(-seq => $seq,
-id => $hsp->hit->seq_id,
-desc => $r->algorithm . " best align to ". $hsp->query->seq_id );
last unless $keepall;
}
}
push @alns, $aln_factory->align(\@seqs);
}
my $out = new Bio::AlignIO(-format => 'phylip',
-interleaved => 1,
-idlength => $IDLENGTH,
-file => ">alignfile.phy");
$out->write_aln(@alns);
$out = undef;
# these need to be parameterized for cmdline arguments
my @params = ('idlength'=>$IDLENGTH,
'model'=>'cat',
'gencode'=>'U',
'category'=>'H',
'probchange'=>'0.4',
'trans'=>'5',
'freq'=>'0.25,0.5,0.125,0.125');
my $dist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
$dist_factory->quiet(1);
@params = ('type'=>'NJ',
'outgroup'=>1,
'upptri'=>1,
'jumble'=>17);
my $tree_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params);
$tree_factory->quiet(1);
my $count = 1;
my $outtrees = new Bio::TreeIO(-file => ">trees.tre",
-format => 'newick');
foreach my $aln ( @alns ) {
# NOTE NOTE NOTE
# This interface is probably going to change from create_tree to
# next_tree per some discussions I'm having with Shawn - we may need
# to tweak any scripts before you publish
# also may move the create_distance_matrix method around some
# and need to write in the switched support for Molphy's ProtML
my $matrix = $dist_factory->create_distance_matrix($aln);
my @seqnames = keys %$matrix;
open my $MATRIX, '>', "Group$count.dist" or die "Could not write file 'Group$count.dist': $!\n";
printf $MATRIX "%4d\n",scalar @seqnames;
for(my $i =0; $i< (scalar @seqnames - 1); $i++ ) {
printf $MATRIX "%-12s ", $seqnames[$i];
for( my $j = $i+1; $j < scalar @seqnames; $j++ ) {
print $MATRIX $matrix->{$seqnames[$i]}->{$seqnames[$j]}," ";
}
print $MATRIX "\n";
}
close $MATRIX;
my $tree = $tree_factory->create_tree("Group$count.dist");
$outtrees->write_tree($tree);
$count++;
}
=head1 NAME
tree_from_seqsearch - builds a phylogenetic tree based on a sequence
search (FastA,BLAST,HMMER)
=head1 DESCRIPTION
This script requires that the bioperl-run pkg be also installed.
=cut
|