This file is indexed.

/usr/bin/art_profiler_454 is in art-nextgen-simulation-tools 20160605+dfsg-2.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/perl
##-----------------------------------------------------------------------------
## PROGRAM: art_profiler_454
## VERSION: 0.9.1
## AUTHOR: Weichun Huang (whduke@gmail.com)
## DATE: last updated on  08/09/2012
## DESCRIPTION:
##	the program generates empirical 454 read profiles for ART 454 simulator
##	from 454 read data files in the fastq format 
##-----------------------------------------------------------------------------

use strict;
my $version="0.9.1";
if(@ARGV!=2 and @ARGV!=3){
       	print "DESCRIPTION:\n\tart_profiler_454 is a program for generating empirical 454 read profiles\n";
	print "\tfor ART 454 simulator from 454 read data files\n";
       	print "\tVersion: $version, Copyright (c) Weichun Huang 2012 (whduke\@gamil.com)\n"; 
	print "USAGE:\n\t$0 input_fastq_files_dir output_profile_dir [fastq_filename_extension]\n";
       	print "Examples:\n";
       	print "\t$0 454_dat_dir 454_profile_dir\n";
       	print "\t$0 454_dat_dir 454_profile_dir fastq\n";
       	print "NOTES:\n";
       	print "\t1)the default filename extension for fastq files is fq\n";
       	print "\t2)the program can read gzipped fastq data files with filename extension fq.gz\n";
       	exit;
}
my ($inDIR, $outDIR, $ext)=@ARGV;
if(@ARGV==2) { $ext="fq"; }

my %read_length;
my $with_fidx=0;
my (%qdist, %qdiff,%qall);

#foreach my $inDIR(@dirlst){
    opendir(DIR, $inDIR) or die "cannot open dir $inDIR\n";
    my @allf=readdir(DIR);
    my @fs=grep { /\.$ext$/ } @allf;
    if(@fs==0){
      @fs=grep { /\.$ext\.gz$/ } @allf;
    }
    foreach my $ff(@fs){
       $ff=~/(\S+)\.$ext/;
       my $name=$1; 

       open(FQ, "zcat  $inDIR/$ff|") if ($ff=~/\.gz$/);# or die "can't open $inDIR/$ff\n";
       open(FQ, "$inDIR/$ff") if ($ff=~/\.$ext$/);# or die "can't open $inDIR/$ff\n";
       my $fidx=$ff; $fidx=~s/\.gz$//;$fidx=~s/\.$ext$/\.idx/;
       open(FIDX, "$inDIR/$fidx") or die "can't open $inDIR/$fidx\n" if($with_fidx);

       while (<FQ>) {
           chomp;
	   my ($id, $ss, $qq, $len);
	   if (/^\@(\S+)/) {
	       $id = $1;
	       $_= <FQ>;
	       $_=~s/\s+$//;
	       $len = length;
	       $read_length{$len}+=1;
	       $ss = $_;

	       $_ = <FQ>;
	       $_ = <FQ>;
	       $_=~s/\s+$//;
	       my $qlen = length;
	       die("The number of bases != the number of quals.") if($len!=$qlen);
	       my @base=split //,$ss;
               my @qv;
	       my $i=0;
	       while (/(\S)/g) {
	          push @qv,ord($1)-33;
	       }
	       my @idx;
	       if($with_fidx){
		       my $fid=<FIDX>; $fid=~/^>(\S+)/; $fid=$1;
		       die "$fid!=$id not matched " if($fid ne  $id); 
		       $fid=<FIDX>; $fid=~s/\s+$//g; @idx=split /\t/,$fid;
	       }
	       else{
		       my $N=$base[0];
		       my $c=1;
		       foreach my $n (@base){
			       if($N eq $n){
				       push @idx, $c; 
			       }
			       else{
				       $N = $n;
				       $c++;
				       push @idx, $c; 
			       }
		       }
	       }
	       my $i=0;
	       while($i<@idx){
		       my $k=$idx[$i];
		       my @q;
		       my $cc=0;
		       while (($i<@idx) and ($k==$idx[$i])){
			       $cc+=1;
			       $i++;
		       }
		       $qdist{$cc}{$qv[$i-$cc]}+=1;
		       $qall{$cc}{0}{$qv[$i-$cc]}+=1;
		       foreach my $x(1..$cc-1){
			       my $d=$i-$cc+$x;
			       my @aa=@qv[$d-1,$d];
			       $qdiff{$cc}{$x}{$aa[0]}{$aa[1]}+=1;
			       $qall{$cc}{$x}{$aa[1]}+=1;
		       }
	       }
           }
       }
    }
#}

#the qual distribution of first postion for different len 
open (QPROFILE, ">$outDIR/qual_1st_profile") or die "Can't open $outDIR/qual_1st_profile\n";
print QPROFILE <<QPROFFORMAT;
##454 read profile for ART 454 simulator
##homopolymer-length dependent quality score distribution of the 1st-base of homopolymer runs 
##FORMAT
##lenth_of_homopolymer	quality_score_1	quality_score_2 ... 
##lenth_of_homopolymer	freq_of_score_1	freq_of_score_2 ...
QPROFFORMAT
my $p=0;
foreach my $len (sort {$a<=>$b} keys %qdist){
	#ensure no break in length
       	last if($len!=$p+1); $p=$len;
       	print QPROFILE $len;
       	my $count="";
       	foreach my $q (sort {$a<=>$b} keys %{$qdist{$len}}){
	       	print QPROFILE "\t$q";
	       	$count.=sprintf "\t%d", $qdist{$len}{$q};
       	}
       	printf QPROFILE "\n%d%s\n",$len,$count;
}
print QPROFILE "*this is the end of the profile";
close(QPROFILE);

#1st order Markov model of qual distribution for different len 
open (QMC, ">$outDIR/qual_mc_profile") or die "Can't open $outDIR/qual_mc_profile\n";
print QMC <<QMCFORMAT;
##454 read profile for ART 454 simulator
##the empirical transition distribution of the homopolymer-length dependent 1st order Markov model of quality scores
##FORMAT
##homopolymer_length	previous_base_position	previous_base_quality_score	next_base_quality_score_1	next_base_quality_score_2	...
##homopolymer_length	previous_base_position	previous_base_quality_score	freq_next_base_qual_score_1	freq_next_base_qual_score_2	...
QMCFORMAT
foreach my $len (sort {$a<=>$b} keys %qdiff){
	#ensure no break in length
       	last if($len>$p); 
  foreach my $x (sort {$a<=>$b} keys %{$qdiff{$len}}){
    foreach my $q (sort {$a<=>$b} keys %{$qdiff{$len}{$x}}){
      print QMC "$len\t$x\t$q";
      my $count="";
        foreach my $q2 (sort {$a<=>$b} keys %{$qdiff{$len}{$x}{$q}}){
          print QMC "\t$q2";
          $count.=sprintf "\t%d", $qdiff{$len}{$x}{$q}{$q2};
        }
      printf QMC "\n%d\t%d\t%d%s\n",$len,$x,$q,$count;
    }
  }
}
print QMC "*this is the end of the profile";
close(QMC);

#empirical distribution of read length  
open (FLEN, ">$outDIR/length_dist") or die "Can't open file $outDIR/length_dist\n";
print FLEN <<FLENFORMAT;
##454 read profile for ART 454 simulator
##the empirical distribution of 454 read length 
##FORMAT
##leng_1	leng_2	leng_3	...
##freq_1	freq_2	freq_3	...
FLENFORMAT
my $cc;
my $i=0;
foreach my $L (sort {$a<=>$b} keys %read_length){
       	if($i==0){
	       	print FLEN $L;
	       	$cc=$read_length{$L};
	       	$i=1;
	       	next;
       	}
       	print FLEN "\t$L";
       	$cc.="\t".$read_length{$L};
}
print FLEN "\n$cc\n";
print FLEN "*this is the end of the profile";
close(FLEN);