/usr/share/hhsuite/scripts/splitfasta.pl is in hhsuite 3.0~beta2+dfsg-3.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl
# splitfasta.pl
# Split a file with multiple, FASTA formatted sequences into many single-sequence FASTA files
#
# (C) Johannes Soeding, 2012
#
# HHsuite version 3.0.0 (15-03-2015)
#
# Reference:
# Remmert M., Biegert A., Hauser A., and Soding J.
# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de
use lib ( $ENV{"HHLIB"} || '/usr/share/hhsuite' )."/scripts";
use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
use strict;
use warnings;
# Default extension of the generated sequence files; overridden by -ext.
my $ext="seq";

# Help text shown when no input file is given.
# NOTE(review): $VERSION is not declared in this script; presumably it is
# exported by HHPaths -- confirm there.
my $usage="
splitfasta.pl from HHsuite $VERSION
Split a file with multiple, FASTA formatted sequences into multiple single-sequence FASTA files.
Write files into current directory and name each file by the first word after \">\" in the name line.
Usage: splitfasta.pl infile [option]
Option:
-fam : use family-based name (for SCOP/ASTRAL sequences)
-name : use sequence name as file name (default)
-ext <ext> : extension for sequence files (default=$ext)
\n";

# The input file is mandatory; without it only the help text is printed.
if (@ARGV<1) {die $usage;}

my $line;              # line currently being read from the input file
my $infile=$ARGV[0];   # multi-sequence FASTA file to split
my $outfile;           # name of the file the current record will be written to
my $sequence="";       # text of the current record (name line + residue lines)
my $options="";        # all command line arguments after the input file
my $fam=0;             # option -fam?
my $famid="";          # second word of the current name line (used with -fam)
my %numfams=();        # counter used to number the output files with -fam
my $n=0;               # number of name lines read in so far

# Join everything after the input file name into one string so that the
# options can be picked out with regular expressions.
if (@ARGV>1) {
    $options.=join(" ",@ARGV[1..$#ARGV]);
}

# Parse the options. Each flag must be a whitespace-delimited word of its own:
# matching bare substrings would, for example, cut "-fam" out of the value in
# "-ext my-fam" and turn on family naming by accident.
# If both -fam and -name are given, -name wins regardless of their order.
if ($options=~s/(?<!\S)-fam(?!\S)//g)     {$fam=1;}
if ($options=~s/(?<!\S)-name(?!\S)//g)    {$fam=0;}
if ($options=~s/(?<!\S)-ext\s+(\S+)//g)   {$ext=$1;}
# Write one complete FASTA record (name line plus the lines following it) to
# the file $path, replacing any existing file of that name.
# Dies if the file cannot be opened or the data cannot be written.
sub write_sequence_file {
    my ($path, $record) = @_;

    # Three-argument open with a lexical handle: the file name is derived from
    # the input data and must never be interpreted as part of the open mode.
    open(my $out_fh, '>', $path) or die("ERROR: Can't open $path: $!\n");
    print {$out_fh} $record      or die("ERROR: Can't write to $path: $!\n");
    # Buffered write errors only surface at close time.
    close($out_fh)               or die("ERROR: Can't write to $path: $!\n");
    return;
}

# Open the input. "-" keeps meaning standard input, as it did with the former
# two-argument open.
my $in_fh;
if ($infile eq '-') {
    $in_fh = \*STDIN;
} else {
    open($in_fh, '<', $infile) or die("ERROR: Can't open $infile: $!\n");
}

# A name line starts a new record. With -fam the second word of the name line
# (the family id) names the file, otherwise the first word (the sequence id).
my $name_line_re = $fam ? qr/^>\S+\s+(\S+)/ : qr/^>(\S+)/;
my %seen_id = ();   # sequence ids already encountered (name mode only)

while ($line = <$in_fh>) {
    if ($line =~ $name_line_re) {
        my $key = $1;   # copy the capture before anything can clobber it

        # The previous record is complete now: flush it before starting anew.
        if ($n) {write_sequence_file($outfile, $sequence);}

        if ($fam) {
            # Number the files per family: <famid>.1.<ext>, <famid>.2.<ext>, ...
            # The counter must be keyed by the family id, not by the -fam flag.
            $famid = $key;
            $numfams{$famid}++;
            $outfile = "$famid.$numfams{$famid}.$ext";
        } else {
            # A repeated id maps to the same file name, so the earlier file
            # will be overwritten; tell the user.
            if ($seen_id{$key}) {print("\nWarning: id $key appears more than once in $infile\n");}
            $seen_id{$key} = 1;

            # Replace '|' and '.' in the id by '_' to build the file name.
            (my $basename = $key) =~ tr/|./__/;
            $outfile = "$basename.$ext";
        }

        $sequence = $line;
        $n++;
    } else {
        # With -fam, a name line lacking a family id does not start a new
        # record; it stays part of the current one. Make that visible.
        if ($fam && $line =~ /^>/) {
            warn("Warning: no family id found in name line of $infile: $line");
        }
        $sequence .= $line;
    }
}

# Flush the last record.
if ($n) {write_sequence_file($outfile, $sequence);}

close($in_fh);
printf("Created %i sequence files\n",$n);
|