/usr/share/perl5/Genome/Model/Tools/Music/Proximity.pm is in libgenome-model-tools-music-perl 0.04-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 | package Genome::Model::Tools::Music::Proximity;
use warnings;
use strict;
use IO::File;
our $VERSION = $Genome::Model::Tools::Music::VERSION;
class Genome::Model::Tools::Music::Proximity {
is => 'Command::V2',
has_input => [
maf_file => { is => 'Text', doc => "List of mutations using TCGA MAF specifications v2.3" },
output_dir => { is => 'Text', doc => "Directory where output files will be written" },
max_proximity => { is => 'Text', doc => "Maximum allowed AA distance between 2 mutations", is_optional => 1, default => 7 },
skip_non_coding => { is => 'Boolean', doc => "Skip non-coding mutations from the provided MAF file", is_optional => 1, default => 1 },
skip_silent => { is => 'Boolean', doc => "Skip silent mutations from the provided MAF file", is_optional => 1, default => 1 },
],
has_output => [
output_file => {is => 'Text', doc => "TODO"},
],
doc => "Perform a proximity analysis on a list of mutations."
};
sub help_detail {
return <<HELP
This module first calculates the amino acid position of each mutation in the MAF file within its
respective transcript. Then, for each mutation, two values are calculated: 1) the number of other
mutations on the same transcript within the proximity limit set by the max-proximity input
parameter, and 2) the distance to the closest other mutation in this nearby set. Only mutations
which have another mutation within close proximity are reported in the output-file.
In addition to the standard version 2.3 MAF headers, there needs to be 3 columns appended. These
column headers in the MAF must have these names in the header in order for the tool to find them:
transcript_name - the transcript name, such as NM_000028
amino_acid_change - the amino acid change, such as p.R290H
c_position - the nucleotide position changed, such as c.869
The output is generated with the folowing column headers:
Mutations_Within_Proximity, Nearest_Mutation, Gene, Transcript, Affected_Amino_Acid(s), Chr,
Start, Stop, Ref_Allele, Var_Allele, Sample
HELP
}
sub help_synopsis {
return <<EOS
... music proximity \\
--maf-file input_dir/myMAF.tsv \\
--output-dir output_dir/ \\
--max-proximity 15
EOS
}
sub _doc_authors {
return <<EOS
Nathan D. Dees, Ph.D.
Dan Koboldt, M.S.
Cyriac Kandoth, Ph.D.
EOS
}
sub execute {
my $self = shift;
my $maf_file = $self->maf_file;
my $output_dir = $self->output_dir;
my $max_proximity = $self->max_proximity;
my $skip_non_coding = $self->skip_non_coding;
my $skip_silent = $self->skip_silent;
$output_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
# Check on all the input data before starting work
print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file );
print STDERR "Output directory not found: $output_dir\n" unless( -e $output_dir );
return undef unless( -s $maf_file && -e $output_dir );
# Output of this script will be written to this location in the output directory
my $out_file = "$output_dir/proximity_report";
$self->output_file($out_file);
# Parse the header row in the MAF file
my $maf_fh = IO::File->new( $maf_file ) or die "Couldn't open $maf_file. $!";
my $maf_header = $maf_fh->getline;
$maf_header = $maf_fh->getline while( $maf_header =~ /^#/ ); # Skip commented lines
chomp( $maf_header );
unless( $maf_header =~ /^Hugo_Symbol/ )
{
print STDERR "Could not find column headers in $maf_file\n";
return undef;
}
# Check whether the required additional MAF columns were included in the MAF
unless( $maf_header =~ m/c_position/ and $maf_header =~ m/amino_acid_change/ and $maf_header =~ m/transcript_name/ )
{
print STDERR "Could not find required additional columns in $maf_file\n";
return undef;
}
# Find the indexes of all the MAF columns
my $idx = 0;
my %col_idx = map {($_, $idx++)} split( /\t/, $maf_header );
# A hash to store statuses, and a hash to store variants and AA positions
my %status;
my $status = \%status;
my %aa_mutations;
# Load relevant data from MAF into hash
while( my $line = $maf_fh->getline )
{
chomp $line;
my @cols = split( /\t/, $line );
# Fetch data from the generic MAF columns
my ( $gene, $chr, $start, $stop, $mutation_class, $mutation_type, $ref_allele, $var_allele, $var2, $sample ) =
( $cols[0], $cols[4], $cols[5], $cols[6], $cols[8], $cols[9], $cols[10], $cols[11], $cols[12], $cols[15] );
$var_allele = $var2 if ( $var_allele eq $ref_allele ); # Different centers interpret the 2 variant columns differently
# Fetch data from the required additional MAF columns
my ( $c_position, $aa_change, $transcript ) = ( $cols[$col_idx{c_position}], $cols[$col_idx{amino_acid_change}], $cols[$col_idx{transcript_name}] );
# Create a key to uniquely identify each variant
my $variant_key = join( "\t", $gene, $chr, $start, $stop, $ref_allele, $var_allele, $sample );
#check that the mutation class is acceptable
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|In_Frame_Ins|Silent|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ )
{
print STDERR "Unrecognized Variant_Classification \"$mutation_class\" in MAF file for gene $gene\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $skip_silent && $mutation_class =~ m/^Silent$/ ))
{
print "Skipping $mutation_class mutation in gene $gene.\n";
$status{synonymous_mutations_skipped}++;
next;
}
# Determine amino acid position and load into hash
my @mutated_aa_positions = ();
@mutated_aa_positions = $self->get_amino_acid_pos( $variant_key, $mutation_class, $c_position, $aa_change, $status );
# Record data in hash if mutated aa position found
if( scalar( @mutated_aa_positions ) > 0 )
{
push( @{$aa_mutations{$transcript}{$variant_key}{mut_AAs}}, @mutated_aa_positions );
}
}
$maf_fh->close;
# Evaluate proximity of mutated amino acids for each transcript
for my $transcript ( keys %aa_mutations )
{
# For each variant hitting that transcript
for my $variant ( keys %{$aa_mutations{$transcript}} )
{
# Initialize the search
my @affected_amino_acids = @{$aa_mutations{$transcript}{$variant}{mut_AAs}};
my $mutations_within_proximity = 0; # Variable for summing # of mutations within proximity
my $min_proximity = $max_proximity + 1; # Current minimum proximity
# For each OTHER variant hitting the transcript
for my $other_variant ( keys %{$aa_mutations{$transcript}} )
{
# Ignore the current mutation
next if $variant eq $other_variant;
# Get affected amino acids from OTHER variant
my @other_affected_amino_acids = @{$aa_mutations{$transcript}{$other_variant}{mut_AAs}};
# Compare distances between amino acids
my $found_close_one = 0;
for my $other_variant_aa ( @other_affected_amino_acids )
{
for my $variant_aa ( @affected_amino_acids )
{
my $distance = abs($other_variant_aa - $variant_aa);
# If distance is within range
if( $distance <= $max_proximity )
{
$found_close_one++;
$min_proximity = $distance if $distance < $min_proximity;
}
}
}
# Note that this variant is within proximity if applicable
$mutations_within_proximity++ if $found_close_one;
}
# Now, save results in hash if there are any
if ($mutations_within_proximity)
{
$aa_mutations{$transcript}{$variant}{muts_within_range} = $mutations_within_proximity;
$aa_mutations{$transcript}{$variant}{min_proximity} = $min_proximity;
}
}
}
# Print results
my $out_fh = IO::File->new( $out_file, ">" ) or die "Couldn't open $out_file. $!";
$out_fh->print( "Mutations_Within_Proximity\tNearest_Mutation\tGene\tTranscript\tAffected_Amino_Acid(s)\tChr\tStart\tStop\tRef_Allele\tVar_Allele\tSample\n" );
for my $transcript ( keys %aa_mutations )
{
for my $variant ( keys %{$aa_mutations{$transcript}} )
{
if( exists $aa_mutations{$transcript}{$variant}{muts_within_range} )
{
my ( $gene, $chr, $start, $stop, $ref_allele, $var_allele, $sample ) = split( /\t/, $variant );
my $affected_amino_acids = join( ",", sort @{$aa_mutations{$transcript}{$variant}{mut_AAs}} );
my $line = join( "\t", $aa_mutations{$transcript}{$variant}{muts_within_range},
$aa_mutations{$transcript}{$variant}{min_proximity}, $gene, $transcript,
$affected_amino_acids, $chr, $start, $stop, $ref_allele, $var_allele, $sample );
$out_fh->print( "$line\n" );
}
}
}
$out_fh->close();
return 1;
}
################################################################################
=head2 get_amino_acid_pos
This subroutine deducts the amino acid position within the transcript using the c_position and amino_acid_position columns in the MAF.
=cut
################################################################################
sub get_amino_acid_pos {
# Parse arguments
my $self = shift;
my ( $variant_key, $mut_class, $c_position, $aa_change, $status ) = @_;
# Initialize variables
my $tx_start = my $tx_stop = 0;
my $aa_position_start = my $aa_position_stop = 0;
my $inferred_aa_start = my $inferred_aa_stop = 0;
my $aa_pos = my $inferred_aa_pos = 0;
# Amino acid position determination
if( $aa_change && $aa_change ne "NULL" && substr( $aa_change, 0, 1 ) ne "e" )
{
$aa_pos = $aa_change;
$aa_pos =~ s/[^0-9]//g;
}
# Parse out c_position if applicable ##
if( $c_position && $c_position ne "NULL" )
{
# If multiple results, parse both ##
if( $c_position =~ '_' && !( $mut_class =~ 'splice' ))
{
($tx_start, $tx_stop) = split( /\_/, $c_position );
$tx_start =~ s/[^0-9]//g;
$tx_stop =~ s/[^0-9]//g;
if( $tx_stop < $tx_start )
{
$inferred_aa_start = $tx_stop / 3;
$inferred_aa_start = sprintf( "%d", $inferred_aa_start ) + 1 if( $tx_stop % 3 ) ;
$inferred_aa_stop = $tx_start / 3;
$inferred_aa_stop = sprintf( "%d", $inferred_aa_stop ) + 1 if( $tx_start % 3 );
}
else
{
$inferred_aa_start = $tx_start / 3;
$inferred_aa_start = sprintf( "%d", $inferred_aa_start ) + 1 if( $tx_start % 3 );
$inferred_aa_stop = $tx_stop / 3;
$inferred_aa_stop = sprintf( "%d", $inferred_aa_stop ) + 1 if( $tx_stop % 3 );
}
}
else
{
my ( $tx_pos ) = split( /[\+\-\_]/, $c_position );
$tx_pos =~ s/[^0-9]//g;
$tx_start = $tx_stop = $tx_pos;
if($tx_pos)
{
$inferred_aa_pos = $tx_pos / 3;
$inferred_aa_pos = sprintf( "%d", $inferred_aa_pos ) + 1 if( $tx_pos % 3 );
$inferred_aa_start = $inferred_aa_stop = $inferred_aa_pos;
}
}
}
# If we inferred aa start stop, proceed with it ##
if( $inferred_aa_start && $inferred_aa_stop )
{
$aa_position_start = $inferred_aa_start;
$aa_position_stop = $inferred_aa_stop;
$status->{aa_position_inferred}++;
}
# Otherwise if we inferred aa position ##
elsif( $aa_pos )
{
$aa_position_start = $aa_pos;
$aa_position_stop = $aa_pos;
$status->{c_position_not_available}++;
}
# Otherwise we were unable to infer the info ##
else
{
$status->{aa_position_not_found}++;
$self->status_message( "Amino acid position not found for variant: $variant_key" );
return;
}
# Proceed if we have aa_position_start and stop ##
my %mutated_aa_positions;
if( $aa_position_start && $aa_position_stop )
{
for( my $this_aa_pos = $aa_position_start; $this_aa_pos <= $aa_position_stop; $this_aa_pos++ )
{
$mutated_aa_positions{$this_aa_pos}++;
}
}
my @mutated_aa_positions = keys %mutated_aa_positions;
return @mutated_aa_positions;
}
1;
|