/usr/bin/art_profiler_454 is in art-nextgen-simulation-tools 20160605+dfsg-2build1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | #!/usr/bin/perl
##-----------------------------------------------------------------------------
## PROGRAM: art_profiler_454
## VERSION: 0.9.1
## AUTHOR: Weichun Huang (whduke@gmail.com)
## DATE: last updated on 08/09/2012
## DESCRIPTION:
## the program generates empirical 454 read profiles for ART 454 simulator
## from 454 read data files in the fastq format
##-----------------------------------------------------------------------------
use strict;
my $version="0.9.1";
if(@ARGV!=2 and @ARGV!=3){
print "DESCRIPTION:\n\tart_profiler_454 is a program for generating empirical 454 read profiles\n";
print "\tfor ART 454 simulator from 454 read data files\n";
print "\tVersion: $version, Copyright (c) Weichun Huang 2012 (whduke\@gamil.com)\n";
print "USAGE:\n\t$0 input_fastq_files_dir output_profile_dir [fastq_filename_extension]\n";
print "Examples:\n";
print "\t$0 454_dat_dir 454_profile_dir\n";
print "\t$0 454_dat_dir 454_profile_dir fastq\n";
print "NOTES:\n";
print "\t1)the default filename extension for fastq files is fq\n";
print "\t2)the program can read gzipped fastq data files with filename extension fq.gz\n";
exit;
}
my ($inDIR, $outDIR, $ext)=@ARGV;
if(@ARGV==2) { $ext="fq"; }
my %read_length;
my $with_fidx=0;
my (%qdist, %qdiff,%qall);
#foreach my $inDIR(@dirlst){
opendir(DIR, $inDIR) or die "cannot open dir $inDIR\n";
my @allf=readdir(DIR);
my @fs=grep { /\.$ext$/ } @allf;
if(@fs==0){
@fs=grep { /\.$ext\.gz$/ } @allf;
}
foreach my $ff(@fs){
$ff=~/(\S+)\.$ext/;
my $name=$1;
open(FQ, "zcat $inDIR/$ff|") if ($ff=~/\.gz$/);# or die "can't open $inDIR/$ff\n";
open(FQ, "$inDIR/$ff") if ($ff=~/\.$ext$/);# or die "can't open $inDIR/$ff\n";
my $fidx=$ff; $fidx=~s/\.gz$//;$fidx=~s/\.$ext$/\.idx/;
open(FIDX, "$inDIR/$fidx") or die "can't open $inDIR/$fidx\n" if($with_fidx);
while (<FQ>) {
chomp;
my ($id, $ss, $qq, $len);
if (/^\@(\S+)/) {
$id = $1;
$_= <FQ>;
$_=~s/\s+$//;
$len = length;
$read_length{$len}+=1;
$ss = $_;
$_ = <FQ>;
$_ = <FQ>;
$_=~s/\s+$//;
my $qlen = length;
die("The number of bases != the number of quals.") if($len!=$qlen);
my @base=split //,$ss;
my @qv;
my $i=0;
while (/(\S)/g) {
push @qv,ord($1)-33;
}
my @idx;
if($with_fidx){
my $fid=<FIDX>; $fid=~/^>(\S+)/; $fid=$1;
die "$fid!=$id not matched " if($fid ne $id);
$fid=<FIDX>; $fid=~s/\s+$//g; @idx=split /\t/,$fid;
}
else{
my $N=$base[0];
my $c=1;
foreach my $n (@base){
if($N eq $n){
push @idx, $c;
}
else{
$N = $n;
$c++;
push @idx, $c;
}
}
}
my $i=0;
while($i<@idx){
my $k=$idx[$i];
my @q;
my $cc=0;
while (($i<@idx) and ($k==$idx[$i])){
$cc+=1;
$i++;
}
$qdist{$cc}{$qv[$i-$cc]}+=1;
$qall{$cc}{0}{$qv[$i-$cc]}+=1;
foreach my $x(1..$cc-1){
my $d=$i-$cc+$x;
my @aa=@qv[$d-1,$d];
$qdiff{$cc}{$x}{$aa[0]}{$aa[1]}+=1;
$qall{$cc}{$x}{$aa[1]}+=1;
}
}
}
}
}
#}
#the qual distribution of first postion for different len
open (QPROFILE, ">$outDIR/qual_1st_profile") or die "Can't open $outDIR/qual_1st_profile\n";
print QPROFILE <<QPROFFORMAT;
##454 read profile for ART 454 simulator
##homopolymer-length dependent quality score distribution of the 1st-base of homopolymer runs
##FORMAT
##lenth_of_homopolymer quality_score_1 quality_score_2 ...
##lenth_of_homopolymer freq_of_score_1 freq_of_score_2 ...
QPROFFORMAT
my $p=0;
foreach my $len (sort {$a<=>$b} keys %qdist){
#ensure no break in length
last if($len!=$p+1); $p=$len;
print QPROFILE $len;
my $count="";
foreach my $q (sort {$a<=>$b} keys %{$qdist{$len}}){
print QPROFILE "\t$q";
$count.=sprintf "\t%d", $qdist{$len}{$q};
}
printf QPROFILE "\n%d%s\n",$len,$count;
}
print QPROFILE "*this is the end of the profile";
close(QPROFILE);
#1st order Markov model of qual distribution for different len
open (QMC, ">$outDIR/qual_mc_profile") or die "Can't open $outDIR/qual_mc_profile\n";
print QMC <<QMCFORMAT;
##454 read profile for ART 454 simulator
##the empirical transition distribution of the homopolymer-length dependent 1st order Markov model of quality scores
##FORMAT
##homopolymer_length previous_base_position previous_base_quality_score next_base_quality_score_1 next_base_quality_score_2 ...
##homopolymer_length previous_base_position previous_base_quality_score freq_next_base_qual_score_1 freq_next_base_qual_score_2 ...
QMCFORMAT
foreach my $len (sort {$a<=>$b} keys %qdiff){
#ensure no break in length
last if($len>$p);
foreach my $x (sort {$a<=>$b} keys %{$qdiff{$len}}){
foreach my $q (sort {$a<=>$b} keys %{$qdiff{$len}{$x}}){
print QMC "$len\t$x\t$q";
my $count="";
foreach my $q2 (sort {$a<=>$b} keys %{$qdiff{$len}{$x}{$q}}){
print QMC "\t$q2";
$count.=sprintf "\t%d", $qdiff{$len}{$x}{$q}{$q2};
}
printf QMC "\n%d\t%d\t%d%s\n",$len,$x,$q,$count;
}
}
}
print QMC "*this is the end of the profile";
close(QMC);
#empirical distribution of read length
open (FLEN, ">$outDIR/length_dist") or die "Can't open file $outDIR/length_dist\n";
print FLEN <<FLENFORMAT;
##454 read profile for ART 454 simulator
##the empirical distribution of 454 read length
##FORMAT
##leng_1 leng_2 leng_3 ...
##freq_1 freq_2 freq_3 ...
FLENFORMAT
my $cc;
my $i=0;
foreach my $L (sort {$a<=>$b} keys %read_length){
if($i==0){
print FLEN $L;
$cc=$read_length{$L};
$i=1;
next;
}
print FLEN "\t$L";
$cc.="\t".$read_length{$L};
}
print FLEN "\n$cc\n";
print FLEN "*this is the end of the profile";
close(FLEN);
|