This file is indexed.

/usr/bin/bp_blast2tree is in bioperl 1.7.1-2.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/perl 
# Author:  Jason Stajich <jason@bioperl.org>
# Purpose: Blast Report -> MSA -> Tree 


# This needs lots more error checking, cmdline input of parameters
# and support for other treebuilding -- only Phylip Neighbor for 
# protein data is supported

# Also proper pulling in of the correct sequence data from the
# alignment - multiple hits on different parts of a protein aren't
# going to work properly right now.  So this is mostly and example
# starting point which needs a lot more work to be made robust.

use strict;
use warnings;
use Bio::AlignIO;
use Bio::Tools::Run::Alignment::Clustalw;
use Bio::Tools::Run::Phylo::Phylip::ProtDist;
use Bio::Tools::Run::Phylo::Phylip::Neighbor;
use Bio::Tools::Run::Phylo::Molphy::ProtML;
use Bio::Tools::Run::Phylo::Phylip::ProtPars;
use Bio::SearchIO;
use Bio::SimpleAlign;
use Bio::PrimarySeq;
use Bio::TreeIO;
use Getopt::Long;

my $IDLENGTH = 12;


# we could in fact pull the tree out of the guide tree calculated
# by Clustalw in the alignment, but I believe that is UPGMA 
# which would *NOT* be a good thing to be giving people.

my $aln_factory = Bio::Tools::Run::Alignment::Clustalw->new
    ('ktuple' => 2, "quiet" => 1,
     'matrix' => 'BLOSUM');
my ($report,$format,$tree_method,$cutoff,$keepall);

$format = 'blast';
$tree_method = 'neighbor';
$cutoff = '0.01';
GetOptions(
	   'h|help'       => sub { exec('perldoc', $0);
				   exit(0); },
	   'i|input:s'    => \$report,
	   'f|format:s'   => \$format,
	   'm|method:s'   => \$tree_method,
	   'e|evalue:s'   => \$cutoff,
	   'k|keepall:s'  => \$keepall, # keep all HSPs not just best
	   );


unless( $format =~ /blast|fasta|hmmer/i ) {
    die("Must request a valid search report format (fasta,blast,hmmer)");
}

unless ( $tree_method =~ /nj|neighbor/i || $tree_method =~ /protml|ml/i ) {
    die("Must request a valid tree building method (neighbor,protml)");
} 

my (@alns,@seqs);

my $in = new Bio::SearchIO(-file   => $report,
			   -format => $format);
while( my $r = $in->next_result ) {
    # Let's build the simplest system first
    die("Cannot process report that does not contain protein sequence") 
	unless ($r->algorithm =~ /HMMER|BLASTP|FASTP/i );
    my @seqs;
    while( my $hit = $r->next_hit ) {
	next if $hit->significance > $cutoff;
	while( my $hsp = $hit->next_hsp ) {
	    next if $hsp->evalue > $cutoff;
	    my $seq = $hsp->get_aln->get_seq_by_pos(2)->seq();
	    push @seqs, new Bio::PrimarySeq(-seq => $seq,
					    -id  => $hsp->hit->seq_id,
					    -desc => $r->algorithm . " best align to ". $hsp->query->seq_id );
	    last unless $keepall;
	}
    }
    push @alns, $aln_factory->align(\@seqs);    
}

my $out = new Bio::AlignIO(-format => 'phylip',
			   -interleaved => 1,
			   -idlength => $IDLENGTH,
			   -file => ">alignfile.phy");
$out->write_aln(@alns);
$out = undef;

# these need to be parameterized for cmdline arguments
my @params = ('idlength'=>$IDLENGTH,
	      'model'=>'cat',
	      'gencode'=>'U',
	      'category'=>'H',
	      'probchange'=>'0.4',
	      'trans'=>'5',
	      'freq'=>'0.25,0.5,0.125,0.125');
my $dist_factory = Bio::Tools::Run::Phylo::Phylip::ProtDist->new(@params);
$dist_factory->quiet(1);
@params = ('type'=>'NJ',
	   'outgroup'=>1,
	   'upptri'=>1,
	   'jumble'=>17);

my $tree_factory = Bio::Tools::Run::Phylo::Phylip::Neighbor->new(@params);
$tree_factory->quiet(1);
my $count = 1;
my $outtrees = new Bio::TreeIO(-file => ">trees.tre",
			       -format => 'newick');
foreach my $aln ( @alns ) {
# NOTE NOTE NOTE 
# This interface is probably going to change from create_tree to
# next_tree per some discussions I'm having with Shawn - we may need
# to tweak any scripts before you publish

# also may move the create_distance_matrix method around some
# and need to write in the switched support for Molphy's ProtML
    
    my $matrix = $dist_factory->create_distance_matrix($aln);
    my @seqnames = keys %$matrix; 
    open my $MATRIX, '>', "Group$count.dist" or die "Could not write file 'Group$count.dist': $!\n";
    printf $MATRIX "%4d\n",scalar @seqnames;
    for(my $i =0; $i< (scalar @seqnames  - 1); $i++ ) {
	printf $MATRIX "%-12s     ", $seqnames[$i];
	for( my $j = $i+1; $j < scalar @seqnames; $j++ ) {
	    print $MATRIX $matrix->{$seqnames[$i]}->{$seqnames[$j]},"  ";
	}
	print $MATRIX "\n";
    }
    close $MATRIX;

    my $tree = $tree_factory->create_tree("Group$count.dist");
    $outtrees->write_tree($tree);   
    $count++;
}

=head1 NAME

tree_from_seqsearch - builds a phylogenetic tree based on a sequence
search (FastA,BLAST,HMMER)

=head1 DESCRIPTION

This script requires that the bioperl-run pkg be also installed.

=cut