/usr/bin/bp_search2gff is in bioperl 1.6.924-3.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 | #!/usr/bin/perl
eval 'exec /usr/bin/perl -S $0 ${1+"$@"}'
if 0; # not running under some shell
=head1 NAME
bp_search2gff
=head1 SYNOPSIS
Usage:
bp_search2gff [-o outputfile] [-f reportformat] [-i inputfilename] OR file1 file2 ..
=head1 DESCRIPTION
This script will turn a SearchIO report (BLAST, FASTP, SSEARCH,
AXT, WABA) into GFF.
The options are:
-i infilename - (optional) inputfilename, will read
either ARGV files or from STDIN
-o filename - the output filename [default STDOUT]
-f format - search result format (blast, fasta,waba,axt)
(ssearch is fasta format). default is blast.
-t/--type seqtype - if you want to see query or hit information
in the GFF report
-s/--source - specify the source (will be algorithm name
otherwise like BLASTN)
--method - the method tag (primary_tag) of the features
(default is similarity)
--scorefunc - a string or a file that when parsed evaluates
to a closure which will be passed a feature
object and that returns the score to be printed
--locfunc - a string or a file that when parsed evaluates
to a closure which will be passed two
features, query and hit, and returns the
location (Bio::LocationI compliant) for the
GFF3 feature created for each HSP; the closure
may use the clone_loc() and create_loc()
functions for convenience, see their PODs
--onehsp - only print the first HSP feature for each hit
-p/--parent - the parent to which HSP features should refer
if not the name of the hit or query (depending
on --type)
--target/--notarget - whether to always add the Target tag or not
-h - this help menu
--version - GFF version to use (put a 3 here to use gff 3)
--component - generate GFF component fields (chromosome)
-m/--match - generate a 'match' line which is a container
of all the similarity HSPs
--addid - add ID tag in the absence of --match
-c/--cutoff - specify an evalue cutoff
Additionally specify the filenames you want to process on the
command-line. If no files are specified then STDIN input is assumed.
You specify this by doing: bp_search2gff E<lt> file1 file2 file3
=head1 AUTHOR
Jason Stajich, jason-at-bioperl-dot-org
=head1 Contributors
Hilmar Lapp, hlapp-at-gmx-dot-net
=cut
use strict;
use warnings;
use Bio::Tools::GFF;
use Getopt::Long;
use Bio::SearchIO;
use Bio::Location::Simple; # pre-declare to simplify $locfunc implementations
use Bio::Location::Atomic; # pre-declare to simplify $locfunc implementations
use Storable qw(dclone); # for cloning location objects
use Bio::Factory::FTLocationFactory;
my (
$output, # output file (if not stdout)
$input, # name of the input file
$format, # format of the input file, defauly is blast
$type, # 'query' or 'hit'
$cutoff, # cut-off value for e-value filter
$sourcetag, # explicit source tag (will be taken from program
# otherwise
$methodtag, # primary tag (a.k.a. method), default 'similarity'
$gffver, # GFF version (dialect) to write
$scorefunc, # closure returning the score for a passed feature
$locfunc, # closure returning a location object for a passed
# query and hit feature
$addid, # flag: whether to always add the ID for $match == 0
$parent, # the name of the parent to use; if set and $match == 0
# will always add the target
$comp, # flag: whether to print a component feature
$addtarget, # flag: whether to always add the Target tag, default
# is true
$match, # flag: whether to print match lines as containers
$onehsp, # flag: whether to consider only the first HSP for a hit
$quiet, # flag: run quietly
$help # flag: show help screen
);
# set defaults:
$format = 'blast';
$type = 'query';
$gffver = 2;
$methodtag = "similarity";
$addtarget = 1;
GetOptions(
'i|input:s' => \$input,
'component' => \$comp,
'm|match' => \$match,
'o|output:s' => \$output,
'f|format:s' => \$format,
's|source:s' => \$sourcetag,
'method=s' => \$methodtag,
'addid' => \$addid,
'scorefunc=s' => \$scorefunc,
'locfunc=s' => \$locfunc,
'p|parent=s' => \$parent,
'target!' => \$addtarget,
'onehsp' => \$onehsp,
't|type:s' => \$type,
'c|cutoff:s' => \$cutoff,
'v|version:i' => \$gffver,
'q|quiet' => \$quiet,
'h|help' => sub {
exec( 'perldoc', $0 );
exit(0);
},
);
$type = lc($type);
if ( $type =~ /target/ ) { $type = 'hit' }
elsif ( $type ne 'query' && $type ne 'hit' ) {
die("seqtype must be either 'query' or 'hit'");
}
# custom or default function returning the score
$scorefunc =
defined($scorefunc) ? parse_code($scorefunc) : sub { shift->score };
# custom or default function returning the location
$locfunc = defined($locfunc) ? parse_code($locfunc) : sub { shift->location };
# if --match is given then $addid needs to be disabled
$addid = undef if $addid && $match;
# if no input is provided STDIN will be used
my $parser = new Bio::SearchIO(
-format => $format,
-verbose => $quiet ? -1 : 0,
-file => $input
);
my $out;
if ( defined $output ) {
$out = new Bio::Tools::GFF(
-gff_version => $gffver,
-file => ">$output"
);
}
else {
$out = new Bio::Tools::GFF( -gff_version => $gffver ); # STDOUT
}
my ( %seen_hit, %seen );
my $other = $type eq 'query' ? 'hit' : 'query';
while ( my $result = $parser->next_result ) {
my $qname = $result->query_name;
if ( $comp
&& $type eq 'query'
&& $result->query_length )
{
$out->write_feature(
Bio::SeqFeature::Generic->new(
-start => 1,
-end => $result->query_length,
-seq_id => $qname,
-source_tag => 'chromosome',
-primary_tag => 'Component',
-tag => {
'Sequence' => $qname
}
)
);
}
while ( my $hit = $result->next_hit ) {
next if ( defined $cutoff && $hit->significance > $cutoff );
my $acc = $qname;
if ( $seen{ $qname . "-" . $hit->name }++ ) {
$acc = $qname . "-" . $seen{ $qname . '-' . $hit->name };
}
if ( $comp
&& $type eq 'hit'
&& $hit->length
&& !$seen_hit{ $hit->name }++ )
{
$out->write_feature(
Bio::SeqFeature::Generic->new(
-start => 1,
-end => $hit->length,
-seq_id => $hit->name,
-source_tag => 'chromosome',
-primary_tag => 'Component',
-tag => {
'Sequence' => $hit->name
}
)
);
}
my ( %min, %max, $seqid, $name, $st );
while ( my $hsp = $hit->next_hsp ) {
my $feature = new Bio::SeqFeature::Generic;
my ( $proxyfor, $otherf );
if ( $type eq 'query' ) {
( $proxyfor, $otherf ) = ( $hsp->query, $hsp->hit );
$name ||= $hit->name;
}
else {
( $otherf, $proxyfor ) = ( $hsp->query, $hsp->hit );
$name ||= $acc;
}
$proxyfor->score( $hit->bits ) unless ( $proxyfor->score );
if ( ( $gffver == 3 ) && ( $match || $parent ) ) {
$feature->add_tag_value( 'Parent', $parent || $name );
}
$min{$type} = $proxyfor->start
unless defined $min{$type} && $min{$type} < $proxyfor->start;
$max{$type} = $proxyfor->end
unless defined $max{$type} && $max{$type} > $proxyfor->end;
$min{$other} = $otherf->start
unless defined $min{$other} && $min{$other} < $otherf->start;
$max{$other} = $otherf->end
unless defined $max{$other} && $max{$other} > $otherf->end;
if ( $addtarget || $match ) {
$feature->add_tag_value( 'Target', 'Sequence:' . $name );
$feature->add_tag_value( 'Target', $otherf->start );
$feature->add_tag_value( 'Target', $otherf->end );
}
if ($addid) {
$feature->add_tag_value( 'ID', $name );
}
$feature->location( &$locfunc( $proxyfor, $otherf ) );
# strand for feature is always going to be product of
# query & hit strands so that target can always be just
# '+'
$feature->strand( $proxyfor->strand * $otherf->strand );
if ($sourcetag) {
$feature->source_tag($sourcetag);
}
else {
$feature->source_tag( $proxyfor->source_tag );
}
$feature->score( &$scorefunc($proxyfor) );
$feature->frame( $proxyfor->frame );
$feature->seq_id( $proxyfor->seq_id );
$feature->primary_tag($methodtag);
# add annotation if encoded in the query description
my $desc = $result->query_description;
while ( $desc =~ /\/([^=]+)=(\S+)/g ) {
$feature->add_tag_value( $1, $2 );
}
$seqid ||= $proxyfor->seq_id;
$out->write_feature($feature);
$st ||= $sourcetag || $proxyfor->source_tag;
last if $onehsp;
}
if ($match) {
my $matchf = Bio::SeqFeature::Generic->new(
-start => $min{$type},
-end => $max{$type},
-strand => $hit->strand($type) * $hit->strand($other),
-primary_tag => 'match',
-source_tag => $st,
-score => $hit->bits,
-seq_id => $seqid
);
if ( $gffver == 3 ) {
$matchf->add_tag_value( 'ID', $name );
}
$matchf->add_tag_value( 'Target', "Sequence:$name" );
$out->write_feature($matchf);
}
}
}
sub parse_code {
my $src = shift;
my $code;
# file or subroutine?
if ( -r $src ) {
if ( !( ( $code = do $src ) && ( ref($code) eq "CODE" ) ) ) {
die "error in parsing code block $src: $@" if $@;
die "unable to read file $src: $!" if $!;
die "failed to run $src, or it failed to return a closure";
}
}
else {
$code = eval $src;
die "error in parsing code block \"$src\": $@" if $@;
die "\"$src\" fails to return a closure"
unless ref($code) eq "CODE";
}
return $code;
}
=head2 clone_loc
Title : clone_loc
Usage : my $l = clone_loc($feature->location);
Function: Helper function to simplify the task of cloning locations
for --locfunc closures.
Presently simply implemented using Storable::dclone().
Example :
Returns : A L<Bio::LocationI> object of the same type and with the
same properties as the argument, but physically different.
All structured properties will be cloned as well.
Args : A L<Bio::LocationI> compliant object
=cut
sub clone_loc {
return dclone(shift);
}
=head2 create_loc
Title : create_loc
Usage : my $l = create_loc("10..12");
Function: Helper function to simplify the task of creating locations
for --locfunc closures. Creates a location from a feature-
table formatted string.
Example :
Returns : A L<Bio::LocationI> object representing the location given
as formatted string.
Args : A GenBank feature-table formatted string.
=cut
sub create_loc {
return Bio::Factory::FTLocationFactory->from_string(shift);
}
|