/usr/share/art-nextgen-simulation-tools/ART_profiler

#!/usr/bin/perl
# A perl program to create an empirical distribution from a frequency file
# generated by the summation.pl program and modified by the combinedAvg.pl 
# program.
# 
# @author Jason Myers
# Date: July 2, 2010


use strict;
use warnings;

# Build an empirical quality-score distribution from a frequency file.
#
# Usage: perl empDist.pl freqFile outputFile
#
# Input (freqFile): the first line is skipped; each of the following 72 lines
# is one row of tab-separated frequencies, laid out as five consecutive groups
# of equal size (one value per read position) in the order A, T, G, C, N.
# Row i (0-based) is reported as quality score i. Only rows 0..70 are written
# to the output; row 71 is read but never reported.
#
# Output (APPENDED to outputFile): for every label ('.', A, T, G, C, N, where
# '.' is the sum over the five nucleotides) and every read position, two lines:
#   label <TAB> position <TAB> quality scores that have a non-zero frequency
#   label <TAB> position <TAB> running (cumulative) frequency at those scores
# Every field, including the last one on a line, is followed by a tab.

# Exactly two arguments are required: the frequency file and the output file.
unless (@ARGV == 2) {
    print STDERR "Usage: perl empDist.pl freqFile outputFile \n";
    exit 1;
}

my ($infile, $outfile) = @ARGV;

# Output labels, in output order; '.' is the aggregate over all nucleotides.
my @NUCLEO = ('.', 'A', 'T', 'G', 'C', 'N');

# Order of the per-nucleotide column groups within one input line.
my @COLUMN_GROUPS = @NUCLEO[1 .. $#NUCLEO];

# Number of data rows read from the input (lines 1..72; line 0 is skipped).
my $NUM_DATA_ROWS = 72;

# All reported quality scores (0..70).
my @SCORES = (0 .. 70);

# Frequency tables, one per label, indexed as $DATA{label}[row][position].
my %DATA = map { $_ => [] } @NUCLEO;

# Read length (values per nucleotide group) of the last data line processed.
my $curLength = 0;

# Three-argument open with a lexical handle: the filename can never be
# interpreted as a mode or a pipe.
open my $in_fh, '<', $infile
    or die "Cannot open '$infile' for reading: $!\n";

# Zero-based line counter; line 0 is skipped.
my $count = 0;

while (my $line = <$in_fh>) {

    # Nothing after the last data row is used, so stop reading.
    last if $count > $NUM_DATA_ROWS;

    if ($count >= 1) {
        # Strip the line terminator so it cannot masquerade as a field.
        $line =~ s/[\r\n]+\z//;

        # split drops trailing empty fields, so a line written with a tab
        # after every value and one written without a final tab both yield
        # the same list of values.
        my @fields = split /\t/, $line;

        # Five equally sized groups per line; any surplus fields are ignored.
        $curLength = int(scalar(@fields) / 5);

        my $row = $count - 1;

        for my $group (0 .. $#COLUMN_GROUPS) {
            my $label  = $COLUMN_GROUPS[$group];
            my $offset = $group * $curLength;

            for my $position (0 .. $curLength - 1) {
                my $value = $fields[$offset + $position];

                # The aggregate table sums the five nucleotides per position.
                $DATA{'.'}[$row][$position] += $value;
                $DATA{$label}[$row][$position] = $value;
            }
        }
    }

    $count++;
}

close $in_fh;

# Append mode is deliberate: existing content of the output file is kept.
open my $out_fh, '>>', $outfile
    or die "Cannot open '$outfile' for appending: $!\n";

for my $label (@NUCLEO) {
    my $table = $DATA{$label};

    for my $pos (0 .. $curLength - 1) {
        my @present;       # quality scores with a non-zero frequency
        my @cumulative;    # running total of the frequencies at those scores
        my $total = 0;

        for my $qual (0 .. $#SCORES) {
            my $freq = $table->[$qual][$pos];

            # Missing cells count as zero and are skipped, like zero cells.
            next unless defined $freq && $freq != 0;

            $total += $freq;
            push @present,    $SCORES[$qual];
            push @cumulative, $total;
        }

        # Each field is followed by a tab, hence the tab before the newline.
        print {$out_fh} join("\t", $label, $pos, @present),    "\t\n";
        print {$out_fh} join("\t", $label, $pos, @cumulative), "\t\n";
    }
}

# Buffered write errors only surface at close, so check it.
close $out_fh
    or die "Cannot close '$outfile': $!\n";

# end empDist.pl
exit;
art-nextgen-simulation-tools-profiles 20160605+dfsg-2build1 / usr / share / art-nextgen-simulation-tools / ART_profiler_illumina / empDist.pl