/usr/share/augustus/scripts/peptides2hints.pl is in augustus 3.3+dfsg-2build1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | #!/usr/bin/perl
# Katharina J. Hoff
# January 2012
#
# Evaluate a psl-file that contains peptide 2 protein mappings
# in context with a gff-file of the protein coding genes
# or a six frame translation (gff!) that
# the peptides were mapped against.
#
# Assumptions:
# The gtf file was sorted by startCoord column, e.g.:
# cat in.gtf | sort -n -k4 > out.gtf
# The gtf file contains in its last column an AUGUSTUS transcript
# descriptor with the following format:
# transcript_id "g(\d*).t(\d*)
#
# In a gene gtf file, only the features CDS and intron are used.
# In a six-frame translation gtf file, the feature frame is used.
#
# In order to avoid "double multiplicity" when working with peptide
# libraries that were generated by searching e.g. alternative transcript
# gene sets (redundant), you have to supply a non redundant fasta file of the peptides that
# contains a header in the following format:
# >pepSomeNo mult=1
# (tab separated, some number after mult=, linebreak)
#
# Example call:
# perl peptides2hints.pl tiny.psl tiny.gtf E
#
# A six-frame fasta file produced with getorf can be converted to gtf format
# with the following command:
# cat six-frame.fa | perl -ne 'if(m/^>/){$_=~s/>//; @t1 = split(/_/); print $t1[0]."\tGETORF\tframe\t"; $t1[1]=~m/\d+ \[(\d+) - (\d+)\]/; $staC = $1; $stoC = $2; if($_ =~ m/REVERSE/){print $stoC."\t".$staC."\t.\t-\t";}else{print $staC."\t".$stoC."\t.\t+\t";} @t2 = split(/ /, $_); print "0\t$t2[0]\n";}' > six-frame.gtf
my $usage = "peptides2hints.pl psl-file gff-file fa-file src pri > hint-file\n";
if(@ARGV!=5){print STDERR $usage; exit -1;}
my $pslFile = $ARGV[0];
my $gffFile = $ARGV[1];
my $faFile = $ARGV[2];
my $src = $ARGV[3];
my $pri = $ARGV[4];
my $gffL;
my %gffHash = ();
my $geneName;
my @gffLine = ();
my %multHash = ();
# read multiplicity information
open(MULT, "<", $faFile) or die ("Could not open fasta file $faFile!\n");
my @multT;
while(<MULT>){
chomp;
if($_=~m/^>/){
$_=~s/>//;
@multT = split(/ /);
$multHash{$multT[0]} = $multT[1];
}
}
close(MULT) or die("Could not close fasta file $faFile!\n");
# read gff file into hash of arrays
open(GFF, "<", $gffFile) or die ("Could not open gff-file $gffFile!\n");
while(<GFF>){
chomp;
$gffL = $_;
if(($_=~m/CDS/) or($_=~m/intron/)){
$gffL =~ m/transcript_id "g(\d*).t(\d*)"/;
$geneName = "g$1.t$2";
push (@{$gffHash{$geneName}}, $gffL);
}elsif($_=~m/frame/){
# muss noch genename für six-Frame einlesen implementieren!
@gffLine = split(/\t/, $gffL);
$geneName = $gffLine[8];
push (@{$gffHash{$geneName}}, $gffL);
}
}
close(GFF) or die ("Could not close gff-file $gffFile!\n");
# compute gene lengths and delete genes whose length is not a multiple of 3
my $thisGeneLen = 0;
my @geneGff = ();
for $geneName ( keys %gffHash ) {
$thisGeneLen = 0;
@geneGff = @{$gffHash{$geneName}};
foreach(@{geneGff}){
@gffLine = split(/\t/);
if($gffLine[2]=~m/CDS/){
$thisGeneLen = $thisGeneLen + ($gffLine[4] - $gffLine[3] + 1);
}
}
if(not($thisGeneLen%3 == 0)){
delete $gffHash{$geneName};
}
}
my @pslLine = ();
my $strand;
my $globalPslStart;
my $globalPslStop;
my @tmpPslLine;
my $preLine = "";
my @preOut;
open(PSL, "<", $pslFile) or die("Could not open psl-file $pslFile\n");
# each line of PSL is processed
while(<PSL>){
chomp;
@pslLine = split(/\t/);
# continue working with perfect matches, only
if($pslLine[0] == $pslLine[10]){
$geneName = $pslLine[13];
#print "Processing $geneName\n";
# get gff-entry for gene
if(exists $gffHash{$geneName}){
#print "Found gene $geneName in gffHash\n";
@geneGff = @{$gffHash{$geneName}};
if(@geneGff[0] =~ m/\t\+\t/){
$strand = "+";
}else{
$strand="-";
# convert psl file to the opposite strand
@tmpPslLine = @pslLine;
$pslLine[15] = $pslLine[14] - $tmpPslLine[16] + 1;
$pslLine[16] = $pslLine[14] - $tmpPslLine[15] + 1;
# pslLine[14] = protein length/target length; pslLine[15] = psl start; pslLine[16] = psl stop
}
$exonC = 1;
$intronLen = 0;
$preLine = "";
foreach(@geneGff){
chomp;
@gffLine = split(/\t/);
if($gffLine[2]=~m/CDS/){
if($exonC==1){
$firstExonStart = $gffLine[3]; # $gffLine[3] = gffStart; $gffStop = $gffLine[4];
}
$globalPslStart = $pslLine[15]*3-2 + $firstExonStart - 1 + $intronLen;
$globalPslStop = $pslLine[16]*3 + $firstExonStart -1 + $intronLen;
if($globalPslStart>=$gffLine[3] and $globalPslStart<=$gffLine[4] and $globalPslStop>=$gffLine[3] and $globalPslStop<=$gffLine[4]){
push(@preOut, $gffLine[0]."\tpep2hints\tCDSpart\t".$globalPslStart."\t".$globalPslStop."\t.\t$strand\t0\tsrc=$src;grp=$pslLine[9]");
$printIntron = 0;
}elsif($globalPslStart>=$gffLine[3] and $globalPslStart<=$gffLine[4] and $globalPslStop>=$gffLine[4]){
$preLine = $preLine.$gffLine[0]."\tpep2hints\tCDSpart\t".$globalPslStart."\t".$gffLine[4]."\t.\t$strand\t";
if($strand eq "+"){
$preLine = $preLine."0";
}else{
$preLine = $preLine."$gffLine[7]";
}
$preLine = $preLine."\tsrc=$src;grp=$pslLine[9]";
push(@preOut, $preLine);
$preLine = "";
$printIntron = 1;
}elsif($globaPslStart<=$gffLine[3] and $globalPslStop>=$pslLine[15] and $globalPslStop<=$gffLine[4] and $printIntron==1){
$preLine = $gffLine[0]."\tpep2hints\tCDSpart\t".$gffLine[3]."\t".$globalPslStop."\t.\t$strand\t";
if($strand eq "+"){
$preLine = $preLine."$gffLine[7]";
}else{
$preLine = $preLine."0";
}
$preLine = $preLine."\tsrc=$src;grp=$pslLine[9]";
push(@preOut, $preLine);
$preLine = "";
$printIntron = 0;
}
$exonC++;
}elsif($gffLine[2]=~m/intron/){
$intronLen = $intronLen + ($gffLine[4] -$gffLine[3] + 1);
if($printIntron==1){
push(@preOut, $gffLine[0]."\tpep2hints\tintron\t$gffLine[3]\t$gffLine[4]\t.\t$strand\t.\tsrc=$src;grp=$pslLine[9]");
}
}elsif($gffLine[2]=~m/frame/){
$globalPslStart = $pslLine[15]*3-2 + $gffLine[3] - 1;
$globalPslStop = $pslLine[16]*3 + $gffLine[3] -1;
if($globalPslStart>=$gffLine[3] and $globalPslStart<=$gffLine[4] and $globalPslStop>=$gffLine[3] and $globalPslStop<=$gffLine[4]){
push(@preOut, $gffLine[0]."\tpep2hints\tCDSpart\t".$globalPslStart."\t".$globalPslStop."\t.\t$strand\t0\tsrc=$src;grp=$pslLine[9]");
}
}
}
}
}
}
close(PSL) or die("Could not close psl-file $pslFile\n");
# print "Computed new coordinates\n";
# print hints
my @t3;
foreach(@preOut){
@gffLine = split(/\t/);
@multT = split(/;/, $gffLine[8]);
@t3 = split(/=/,$multT[1]);
print "$gffLine[0]\t$gffLine[1]\t$gffLine[2]\t$gffLine[3]\t$gffLine[4]\t$gffLine[5]\t$gffLine[6]\t$gffLine[7]\tsrc=$src;$multHash{$t3[1]};pri=$pri\n";
}
# print "Printed results\n";
|