/usr/bin/fasta_clipping_histogram.pl is in fastx-toolkit 0.0.14-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | #!/usr/bin/perl
# FASTX-toolkit - FASTA/FASTQ preprocessing tools.
# Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use strict;
use warnings;
use GD::Graph::bars;
use Data::Dumper;
use PerlIO::gzip;
if (scalar @ARGV==0) {
print<<END;
Create a Linker Clipping Information Histogram
usage: $0 INPUT_FILE.FA OUTPUT_FILE.PNG
INPUT_FILE.FA = input file (in FASTA format, can be GZIPped)
OUTPUT_FILE.PNG = histogram image
END
exit 0;
}
#
# Read parameters
#
open(IN, "<:gzip(autopop)", "$ARGV[0]") or die "Cannot open input file $ARGV[0]\n";
open(OUT, ">$ARGV[1]") or die "Cannot create output file $ARGV[1]\n";
binmode OUT;
my %histogram ;
while (my $name = <IN>) {
my $sequence = <IN> ;
chomp $sequence;
my $sequence_length = length($sequence);
my $count;
if ( index($name, "-")==-1 ) {
#Assume this file is not collapsed, just count each seqeunce as 1
$count = 1 ;
} else {
#Assume file is collapsed (that is - sequence-ID has two numbers with a separating dash)
($count) = $name =~ /^\>[^-]+\-(\d+)$/ ;
# If the match failed, treat this fasta as not collapsed;
$count = 1 if not defined $count ;
}
$histogram{$sequence_length} += $count ;
}
#Textual Output
if (0) {
print "Length\tCount\n";
foreach my $length_key ( sort { $a <=> $b } keys %histogram ) {
print $length_key,"\t", $histogram{$length_key},"\n";
}
exit 0;
}
## Build the data as required by GD::Graph::bars.
## Data list has two items (each item is itself a list)
## 1. a list of x-axis labels (these are the keys from the histogram)
## 2. a list of values
my @data = (
[ sort { $a <=> $b } keys %histogram ],
[ map { $histogram{$_} } sort { $a <=> $b } keys %histogram ] ) ;
my $graph = new GD::Graph::bars (1000,800);
$graph->set(
x_label => 'Length',
y_label => 'Amount',
title => 'Sequences lengths Distribution (after clipping)',
bar_spacing => 10,
transparent => 0,
t_margin => 10,
y_tick_number => 20,
y_long_ticks => 1,
) or die $graph->error;
$graph->plot(\@data) or die $graph->error;
print OUT $graph->gd->png;
close IN;
close OUT;
|