This file is indexed.

/usr/share/augustus/scripts/peptides2hints.pl is in augustus 3.3+dfsg-2build1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/perl

# Katharina J. Hoff
# January 2012
#
# Evaluate a psl-file that contains peptide 2 protein mappings 
# in context with a gff-file of the protein coding genes 
# or a six frame translation (gff!) that
# the peptides were mapped against.
#
# Assumptions:
# The gtf file was sorted by startCoord column, e.g.:
# cat in.gtf | sort -n -k4 > out.gtf
# The gtf file contains in its last column an AUGUSTUS transcript
# descriptor with the following format: 
# transcript_id "g(\d*).t(\d*)
#
# In a gene gtf file, only the features CDS and intron are used.
# In a six-frame translation gtf file, the feature frame is used.
#
# In order to avoid "double multiplicity" when working with peptide
# libraries that were generated by searching e.g. alternative transcript
# gene sets (redundant), you have to supply a non redundant fasta file of the peptides that
# contains a header in the following format:
# >pepSomeNo mult=1
# (tab separated, some number after mult=, linebreak)
#
# Example call:
# perl peptides2hints.pl tiny.psl tiny.gtf E
#
# A six-frame fasta file produced with getorf can be converted to gtf format
# with the following command:
# cat six-frame.fa | perl -ne 'if(m/^>/){$_=~s/>//; @t1 = split(/_/); print $t1[0]."\tGETORF\tframe\t"; $t1[1]=~m/\d+ \[(\d+) - (\d+)\]/; $staC = $1; $stoC = $2; if($_ =~ m/REVERSE/){print $stoC."\t".$staC."\t.\t-\t";}else{print $staC."\t".$stoC."\t.\t+\t";} @t2 = split(/ /, $_); print "0\t$t2[0]\n";}' > six-frame.gtf

my $usage = "peptides2hints.pl psl-file gff-file fa-file src pri > hint-file\n";

if(@ARGV!=5){print STDERR $usage; exit -1;}

my $pslFile = $ARGV[0];
my $gffFile = $ARGV[1];
my $faFile = $ARGV[2];
my $src = $ARGV[3];
my $pri = $ARGV[4];

my $gffL;
my %gffHash = ();
my $geneName;
my @gffLine = ();
my %multHash = ();


# read multiplicity information
open(MULT, "<", $faFile) or die ("Could not open fasta file $faFile!\n");
my @multT;
while(<MULT>){
	chomp;
	if($_=~m/^>/){
		$_=~s/>//;
		@multT = split(/ /);
		$multHash{$multT[0]} = $multT[1];
	}
}
close(MULT) or die("Could not close fasta file $faFile!\n");

# read gff file into hash of arrays
open(GFF, "<", $gffFile) or die ("Could not open gff-file $gffFile!\n");
while(<GFF>){
	chomp;
	$gffL = $_;
	if(($_=~m/CDS/) or($_=~m/intron/)){
		$gffL =~ m/transcript_id "g(\d*).t(\d*)"/;
		$geneName = "g$1.t$2";
		push (@{$gffHash{$geneName}}, $gffL);
	}elsif($_=~m/frame/){
		# muss noch genename für six-Frame einlesen implementieren!
		@gffLine = split(/\t/, $gffL);
		$geneName = $gffLine[8];
		push (@{$gffHash{$geneName}}, $gffL);
	}
}
close(GFF) or die ("Could not close gff-file $gffFile!\n");

# compute gene lengths and delete genes whose length is not a multiple of 3
my $thisGeneLen = 0;
my @geneGff = ();
for $geneName ( keys %gffHash ) {
	$thisGeneLen = 0;
	@geneGff = @{$gffHash{$geneName}};
	foreach(@{geneGff}){
		@gffLine = split(/\t/);
		if($gffLine[2]=~m/CDS/){
			$thisGeneLen = $thisGeneLen + ($gffLine[4] - $gffLine[3] + 1);
		}
	}
	if(not($thisGeneLen%3 == 0)){
		delete $gffHash{$geneName};
	}
}

my @pslLine = ();
my $strand;
my $globalPslStart;
my $globalPslStop;
my @tmpPslLine;
my $preLine = "";
my @preOut;
open(PSL, "<", $pslFile) or die("Could not open psl-file $pslFile\n");

# each line of PSL is processed
while(<PSL>){
	chomp;
	@pslLine = split(/\t/);
	# continue working with perfect matches, only
	if($pslLine[0] == $pslLine[10]){
		$geneName = $pslLine[13];
		#print "Processing $geneName\n";
		# get gff-entry for gene
		if(exists $gffHash{$geneName}){
			#print "Found gene $geneName in gffHash\n";
			@geneGff = @{$gffHash{$geneName}};
			if(@geneGff[0] =~ m/\t\+\t/){
				$strand = "+";
			}else{
				$strand="-";
				# convert psl file to the opposite strand
				@tmpPslLine = @pslLine;
				$pslLine[15] = $pslLine[14] - $tmpPslLine[16] + 1;
				$pslLine[16] = $pslLine[14] - $tmpPslLine[15] + 1;
				# pslLine[14] = protein length/target length; pslLine[15] = psl start; pslLine[16] = psl stop
			}
			$exonC = 1;
			$intronLen = 0;
			$preLine = "";
			foreach(@geneGff){
				chomp;
				@gffLine = split(/\t/);
				if($gffLine[2]=~m/CDS/){
					if($exonC==1){
						$firstExonStart = $gffLine[3]; # $gffLine[3] = gffStart; $gffStop = $gffLine[4];
					}
					$globalPslStart = $pslLine[15]*3-2 + $firstExonStart - 1 + $intronLen;
					$globalPslStop = $pslLine[16]*3 + $firstExonStart -1 + $intronLen;
					if($globalPslStart>=$gffLine[3] and $globalPslStart<=$gffLine[4] and $globalPslStop>=$gffLine[3] and $globalPslStop<=$gffLine[4]){
						push(@preOut, $gffLine[0]."\tpep2hints\tCDSpart\t".$globalPslStart."\t".$globalPslStop."\t.\t$strand\t0\tsrc=$src;grp=$pslLine[9]");	
						$printIntron = 0;
					}elsif($globalPslStart>=$gffLine[3] and $globalPslStart<=$gffLine[4] and $globalPslStop>=$gffLine[4]){
						$preLine = $preLine.$gffLine[0]."\tpep2hints\tCDSpart\t".$globalPslStart."\t".$gffLine[4]."\t.\t$strand\t";
						if($strand eq "+"){
							$preLine = $preLine."0";
						}else{
							$preLine = $preLine."$gffLine[7]";
						}
						$preLine = $preLine."\tsrc=$src;grp=$pslLine[9]";
						push(@preOut, $preLine);
						$preLine = "";
						$printIntron = 1;
					}elsif($globaPslStart<=$gffLine[3] and $globalPslStop>=$pslLine[15] and $globalPslStop<=$gffLine[4] and $printIntron==1){
						$preLine = $gffLine[0]."\tpep2hints\tCDSpart\t".$gffLine[3]."\t".$globalPslStop."\t.\t$strand\t";
						if($strand eq "+"){
							$preLine = $preLine."$gffLine[7]";
						}else{
							$preLine = $preLine."0";
						}
						$preLine = $preLine."\tsrc=$src;grp=$pslLine[9]";
						push(@preOut, $preLine);
						$preLine = "";
						$printIntron = 0;
					}
					$exonC++;
				}elsif($gffLine[2]=~m/intron/){
					$intronLen = $intronLen + ($gffLine[4] -$gffLine[3] + 1);
					if($printIntron==1){
						push(@preOut, $gffLine[0]."\tpep2hints\tintron\t$gffLine[3]\t$gffLine[4]\t.\t$strand\t.\tsrc=$src;grp=$pslLine[9]");
					}
				}elsif($gffLine[2]=~m/frame/){
					$globalPslStart = $pslLine[15]*3-2 + $gffLine[3] - 1;
					$globalPslStop = $pslLine[16]*3 + $gffLine[3] -1;
					if($globalPslStart>=$gffLine[3] and $globalPslStart<=$gffLine[4] and $globalPslStop>=$gffLine[3] and $globalPslStop<=$gffLine[4]){	
						push(@preOut, $gffLine[0]."\tpep2hints\tCDSpart\t".$globalPslStart."\t".$globalPslStop."\t.\t$strand\t0\tsrc=$src;grp=$pslLine[9]");
					}
				}
			}
		}
	}
}
close(PSL) or die("Could not close psl-file $pslFile\n");

# print "Computed new coordinates\n";

# print hints
my @t3;
foreach(@preOut){
	@gffLine = split(/\t/);
	@multT = split(/;/, $gffLine[8]);
	@t3 = split(/=/,$multT[1]);
	print "$gffLine[0]\t$gffLine[1]\t$gffLine[2]\t$gffLine[3]\t$gffLine[4]\t$gffLine[5]\t$gffLine[6]\t$gffLine[7]\tsrc=$src;$multHash{$t3[1]};pri=$pri\n";
}
# print "Printed results\n";