/usr/share/perl5/VcfStats.pm

#
# Author: petr.danecek@sanger
#

=head1 NAME

VcfStats.pm.  Module for collecting stats from VCF files. 

=head1 SYNOPSIS

    use VcfStats;

    my $vstats = VcfStats->new(file=>'example.vcf.gz');
    while (my $x=$vstats->next_data_hash()) 
    {
        $vstats->collect_stats($x);
    }
    $vstats->dump();

=cut

package VcfStats;
use strict;
use warnings;
use Carp;
use Data::Dumper;
use base 'Vcf';

=head2 new

    About   : Creates new VcfStats.
    Usage   : my $vstats = VcfStats->new(file=>'my.vcf');
    Args    : See Vcf.pm

=cut

sub new
{
    my ($class,@args) = @_;
    my $self = $class->SUPER::new(@args);
    for my $version (@{$$self{versions}})
    {
        if ( $self->isa($version) ) { eval "use base '$version'"; }
    }
    bless($self,$class);
    return $self;
}


sub parse_header
{
    my ($self,@args) = @_;
    $self->SUPER::parse_header(@args);
}

=head2 get_stats_key

    About   : Creates relevant stats hash key, used by select_stats
    Usage   : 
    Args [1]: Hash with filter definition (value to match, range, etc.)
         [2]: Prefix of the stat
         [3]: Value of the filter

=cut

sub get_stats_key
{
    my ($self,$filter,$key,$value) = @_;

    my $stat_key;
    if ( $$filter{exact} )
    {
        if ( $value ne $$filter{value} ) { next; }
        $stat_key = $key.'/'.$value;
    }
    elsif ( $value eq '.' ) { $stat_key = $key.'/.'; }
    elsif ( $$filter{any} ) { $stat_key = $key.'/'.$value; }
    elsif ( $$filter{bin} )
    {
        my $bin = int($value/$$filter{bin_size}) * $$filter{bin_size};
        if ( $bin>$$filter{max} ) { $bin=">$$filter{max}"; }
        $stat_key = $key.'/'.$bin;
    }
    else { $self->throw("TODO: $key...\n"); }
    return $stat_key;
}

=head2 select_stats

    About   : Selects relevant stats hashes
    Usage   : 
    Args [1]: Hash record from next_data_hash
         [2]: Filters

=cut

sub select_stats
{
    my ($self,$rec,$filters) = @_;

    if ( !exists($$self{stats}{all}) ) { $$self{stats}{all}={}; }
    my @mandatory = ( $$self{stats}{all} );

    my %samples;
    for my $sample (keys %{$$rec{gtypes}})
    {
        if ( !exists($$self{stats}{samples}{$sample}) ) 
        { 
            $$self{stats}{samples}{$sample} = {}; 
        }
        push @{$samples{$sample}}, $$self{stats}{samples}{$sample};
    }

    if ( !defined $filters ) { return (\@mandatory,\%samples); }

    while (my ($key,$filter) = each %$filters)
    {
        if ( $key eq 'FILTER' ) 
        { 
            for my $value (@{$$rec{FILTER}})
            {
                my $stats_key = $self->get_stats_key($filter,$key,$value);
                if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; }
                push @mandatory, $$self{stats}{$stats_key};
            }
        }
        elsif ( $key eq 'QUAL' ) 
        { 
            my $stats_key = $self->get_stats_key($filter,$key,$$rec{QUAL});
            if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; }
            push @mandatory, $$self{stats}{$stats_key};
        }
        elsif ( $key=~m{^INFO/} ) 
        { 
            if ( $$filter{is_flag} )
            {
                if ( $$filter{value} && !exists($$rec{INFO}{$$filter{tag}}) ) { next; }
                elsif ( !$$filter{value} && exists($$rec{INFO}{$$filter{tag}}) ) { next; }
                if ( !exists($$self{stats}{$key}) ) { $$self{stats}{$key}={}; }
                push @mandatory, $$self{stats}{$key};
                next;
            }
            elsif ( exists($$rec{INFO}{$$filter{tag}}) )
            {
                my $stats_key = $self->get_stats_key($filter,$key,$$rec{INFO}{$$filter{tag}});
                if ( !exists($$self{stats}{$stats_key}) ) { $$self{stats}{$stats_key}={}; }
                push @mandatory, $$self{stats}{$stats_key};
            }
        }
        elsif ( $key=~m{^FORMAT/([^/]+)$} )
        {
            while (my ($sample,$hash) = each %{$$rec{gtypes}})
            {
                if ( !exists($$hash{$1}) ) { next; }
                my $stats_key = $self->get_stats_key($filter,$1,$$hash{$1});
                if ( !exists($$self{stats}{samples}{$sample}{user}{$stats_key}) ) { $$self{stats}{samples}{$sample}{user}{$stats_key}={}; }
                push @{$samples{$sample}}, $$self{stats}{samples}{$sample}{user}{$stats_key};
            }
        }
        elsif ( $key=~m{^SAMPLE/([^/]+)/([^/]+)$} )
        {
            if ( !exists($$rec{gtypes}{$1}{$2}) ) { next; }
            my $stats_key = $self->get_stats_key($filter,$2,$$rec{gtypes}{$1}{$2});
            if ( !exists($$self{stats}{samples}{$1}{user}{$stats_key}) ) { $$self{stats}{samples}{$1}{user}{$stats_key}={} }
            push @{$samples{$1}}, $$self{stats}{samples}{$1}{user}{$stats_key};
        }
        else { $self->throw("The feature currently not recognised: $key.\n"); } 
    }
    return (\@mandatory,\%samples);
}

=head2 collect_stats

    About   : Collect stats 
    Usage   : my $x=$vstats->next_data_hash(); $vstats->collect_stats($x);
    Args    : 

=cut

sub collect_stats
{
    my ($self,$rec,$filters) = @_;

    # Ts/Tv and custom numbers based on INFO, QUAL etc. for the mandatory columns
    my ($mandatory_stats,$sample_stats) = $self->select_stats($rec,$filters);
    $self->collect_stats_mandatory($rec,$mandatory_stats);

    # Ts/Tv for samples
    while (my ($sample,$stats) = each %$sample_stats)
    {
        $self->collect_stats_sample($rec,$sample,$stats);
    }

    my %type_keys = ( r=>'ref', s=>'snp', i=>'indel' );

    # Private calls and the number of shared SNPs. Check if:
    #   - there is a nonref variant present only in this sample (samples->sample_name->private)
    #   - there is a nonref variant in N samples (samples->all->shared)
    #   - there is a non-empty call (samples->sample_name->count)
    my $shared = 0;
    my $sample_name;
    while (my ($sample,$stats) = each %$sample_stats)
    {
        my ($alleles,$seps,$is_phased,$is_empty) = $self->parse_haplotype($rec,$sample);
        if ( $is_empty ) { next; }

        my $is_hom=1;
        my %types;
        my $is_ref = 1;
        for my $al (@$alleles)
        {
            if ( $$alleles[0] ne $al ) { $is_hom=0; }
            my ($type,$len,$ht) = $self->event_type($rec,$al);
            $types{$type} = 1;
            if ( $type eq 'r' ) { next; }
            $is_ref = 0;
        }
        for my $stat (@$stats) { $$stat{count}++; }

        for my $type (keys %types)
        {
            my $key = exists($type_keys{$type}) ? $type_keys{$type} : 'other';
            $key .= '_count';
            for my $stat (@$stats) { $$stat{$key}++; }
        }
        my $key;
        if ( exists($types{r}) ) 
        { 
            if ( $is_hom ) { $key='hom_RR'; }
            else { $key='het_RA' }
        }
        elsif ( $is_hom ) { $key='hom_AA'; }
        else { $key='het_AA'; }
        $key .= '_count';
        for my $stat (@$stats) { $$stat{$key}++; }

        $key = $is_phased ? 'phased' : 'unphased';
        for my $stat (@$stats) { $$stat{$key}++; }

        if ( $is_ref ) { next; }
        $shared++;
        if ( !defined $sample_name ) { $sample_name = $sample; }
    }
    $$self{stats}{all}{shared}{$shared}++;
    if ( $shared==1 )
    {
        for my $stat (@{$$sample_stats{$sample_name}}) { $$stat{private}++; }
    }
}


=head2 collect_stats_mandatory

    About   : Collect stats based on mandatory columns
    Usage   : my $x=$vstats->next_data_hash(); $vstats->collect_stats_mandatory($x);
    Args    : 

=cut

sub collect_stats_mandatory
{
    my ($self,$rec,$stats) = @_;

    # How many mono,bi,tri-allelic etc sites are there
    my $nalt = 0;
    if ( !scalar keys %{$$rec{gtypes}} ) 
    { 
        $nalt = scalar @{$$rec{ALT}};
        if ( $nalt==1 && $$rec{ALT}[0] eq '.' ) { $nalt=0 }
    }
    elsif ( exists($$rec{INFO}{AC}) )
    {
        for my $ac (split(/,/,$$rec{INFO}{AC})) 
        { 
            if ( $ac ) { $nalt++; } 
        }
    }
    else
    {
        my ($an,$ac,$acs) = $self->calc_an_ac($$rec{gtypes});
        for my $ac (@$acs)
        {
            if ( $ac ) { $nalt++; }
        }
    }

    my %types;
    for my $alt (@{$$rec{ALT}})
    {
        if ( $alt eq '.' ) { $alt=$$rec{REF}; }
        my $type = $self->add_variant($rec,$alt,$stats);
        $types{$type} = 1;
    }


    # Increment counters
    for my $stat (@$stats)
    {
        $$stat{'nalt_'.$nalt}++;
        $$stat{count}++;
        for my $type (keys %types)
        {
            $$stat{$type.'_count'}++;
        }
    }
}


=head2 collect_stats_sample

    About   : Collect stats for given sample
    Usage   : my $x=$vstats->next_data_hash(); $vstats->collect_stats_sample($x,'NA0001');
    Args [1]  hash row from next_data_hash 
         [2]  sample name
         [3]  stats to collect

=cut

sub collect_stats_sample
{
    my ($self,$rec,$sample,$stats) = @_;

    my ($alleles,$seps,$is_phased,$is_empty) = $self->parse_haplotype($rec,$sample);
    if ( @$alleles > 2 ) { $self->throw("FIXME: currently handling diploid data only (easy to fix)\n"); }
    my $prev;
    for my $al (@$alleles)
    {
        if ( !defined $prev or $prev ne $al )
        {
            # Only heterozygous SNPs will be counted twice
            $self->add_variant($rec,$al,$stats);
        }
        $prev = $al;
    }
}


=head2 add_variant

    About   : Register mutation type in the selected pool
    Usage   : $vstats->add_variant('A','AT',$stats);
              $vstats->add_variant($rec,'AT',$stats);
    Args      [1] Reference haplotype or VCF data line parsed by next_data_hash
              [2] Variant haplotype
              [3] Array of hash stats
    Returns : The event type (snp,indel,ref)

=cut

sub add_variant
{
    my ($self,$ref,$alt,$stats) = @_;
    my $key_type = 'other';
    my %key_subt;
    if ( $alt eq '.' )
    {
        $key_type = 'missing';
    }
    else
    {
        my ($type,$len,$ht) = $self->event_type($ref,$alt);
        if ( $type eq 's' ) 
        { 
            $key_type = 'snp';

            # The SNP can be encoded for example as GTTTTTTT>CTTTTTTT
            my $ref_str = ref($ref) eq 'HASH' ? $$ref{REF} : $ref;
            my $ref_len = length($ref_str);
            if ( $ref_len>1 )
            {
                for (my $i=0; $i<$ref_len; $i++)
                {
                    my $ref_nt = substr($ref_str,$i,1);
                    my $alt_nt = substr($alt,$i,1);
                    if ( $ref_nt ne $alt_nt )
                    {
                        $key_subt{$ref_nt.'>'.$alt_nt}++;
                    }
                }
            }
            else
            {
                $key_subt{$ref_str.'>'.$alt}++;
            }
        }
        elsif ( $type eq 'i' ) { $key_type = 'indel'; $key_subt{$len}++; }
        elsif ( $type eq 'r' ) { $key_type = 'ref'; }
    }
    for my $stat (@$stats)
    {
        if ( %key_subt )
        {
            while (my ($subt,$value)=each %key_subt)
            {
                $$stat{$key_type}{$subt}+=$value;
            }
        }
        else
        {
            $$stat{$key_type}++;
        }
    }
    return $key_type;
}


=head2 dump

    About   : Produce Data::Dumper dump of the collected stats
    Usage   : 
    Args    :
    Returns : The dump.

=cut

sub dump
{
    my ($self) = @_;
    return Dumper($$self{stats});
}


sub _calc_tstv
{
    my ($self,$stat) = @_;

    my $ts = 0;
    for my $mut (qw(A>G G>A C>T T>C))
    {
        if ( exists($$stat{$mut}) ) { $ts += $$stat{$mut}; }
    }
    my $tv = 0;
    for my $mut (qw(A>C C>A G>T T>G A>T T>A C>G G>C))
    {
        if ( exists($$stat{$mut}) ) { $tv += $$stat{$mut}; }
    }
    my $ratio = $tv ? $ts/$tv : 0;
    return ($ts,$tv,$ratio);
}


=head2 dump_tstv

    About   : Calculate transitions/transversions ratio and output string
    Usage   : 
    Args    :
    Returns : Formatted string

=cut

sub dump_tstv
{
    my ($self,$stats) = @_;
    my $out = "#Transitions\tTransversions\tts/tv\tSample\n";
    for my $key (sort keys %$stats)
    {
        if ( !exists($$stats{$key}{snp}) ) { next; }
        my $stat = $$stats{$key}{snp};
        my ($ts,$tv,$ratio) = $self->_calc_tstv($stat);
        $out .= sprintf "%d\t%d\t%.2f\t%s\n", $ts,$tv,$ratio,$key;
    }
    return $out;
}


=head2 dump_qual_tstv

    About   : Calculate marginal transitions/transversions ratios for QUAL/* stats
    Usage   : 
    Args    :
    Returns : Formatted string

=cut

sub dump_qual_tstv
{
    my ($self,$file) = @_;
    my @values;
    for my $stat (keys %{$$self{stats}})
    {
        if ( !($stat=~m{^QUAL/(.+)}) ) { next; }
        my $qual  = $1;
        # The quality record can be also of the form ">200". Exclude these from numeric comparison
        if ( !($qual=~/^[0-9.]+$/) ) { $qual = "#$qual"; }
        my $count = $$self{stats}{$stat}{count};
        if ( !exists($$self{stats}{$stat}{snp}) ) { next; }
        my ($ts,$tv,$ratio) = $self->_calc_tstv($$self{stats}{$stat}{snp});
        push @values, [$qual,$count,$ratio];
    }
    my @svalues = sort { if ($$a[0]=~/^#/ or $$b[0]=~/^#/) { return $$a[0] cmp $$b[0]; } return $$a[0] <=> $$b[0]; } @values;
    my $out = "#Quality\tMarginal count\tMarginal Ts/Tv\n";
    for my $val (@svalues)
    {
        if ( $$val[0]=~/^#/ )
        {
            $out .= sprintf "%s\t%d\t%.2f\n", $$val[0],$$val[1],$$val[2];
        }
        else
        {
            $out .= sprintf "%.2f\t%d\t%.2f\n", $$val[0],$$val[1],$$val[2];
        }
    }
    return $out;
}


=head2 dump_counts

    About   : 
    Usage   : 
    Args    :
    Returns : Formatted string

=cut

sub dump_counts
{
    my ($self) = @_;
    my $out = "#Count\tFilter\n";
    for my $key (sort keys %{$$self{stats}})
    {
        if ( !exists($$self{stats}{$key}{count}) ) { next; }
        $out .= sprintf "%d\t%s\n", $$self{stats}{$key}{count},$key;
    }
    for my $key (sort keys %{$$self{stats}{samples}})
    {
        if ( !exists($$self{stats}{samples}{$key}{count}) ) { next; }
        $out .= sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{count},$key;
    }
    return $out;
}

sub dump_snp_counts
{
    my ($self) = @_;
    my $out = "#Count\tFilter\n";
    for my $key (sort keys %{$$self{stats}})
    {
        if ( !exists($$self{stats}{$key}{snp_count}) ) { next; }
        $out .= sprintf "%d\t%s\n",$$self{stats}{$key}{snp_count},$key;
    }
    for my $key (sort keys %{$$self{stats}{samples}})
    {
        if ( !exists($$self{stats}{samples}{$key}{snp_count}) ) { next; }
        $out .= sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{snp_count},$key;
    }
    return $out;
}

sub dump_indel_counts
{
    my ($self) = @_;
    my $out = "#Count\tFilter\n";
    for my $key (sort keys %{$$self{stats}})
    {
        if ( !exists($$self{stats}{$key}{indel_count}) ) { next; }
        $out .= sprintf "%d\t%s\n",$$self{stats}{$key}{indel_count},$key;
    }
    for my $key (sort keys %{$$self{stats}{samples}})
    {
        if ( !exists($$self{stats}{samples}{$key}{indel_count}) ) { next; }
        $out .= sprintf "%d\tsamples/%s\n", $$self{stats}{samples}{$key}{indel_count},$key;
    }
    return $out;
}

sub dump_shared_counts
{
    my ($self) = @_;
    my $out = "#Shared SNPs\tFrequency\n";
    for my $key (sort {$a<=>$b} keys %{$$self{stats}{all}{shared}})
    {
        $out .= sprintf "%d\t%s\n", $key,$$self{stats}{all}{shared}{$key};
    }
    return $out;
}

sub dump_private_counts
{
    my ($self) = @_;
    my $out = "#Private SNPs\tSample\n";
    for my $key (sort keys %{$$self{stats}{samples}})
    {
        if ( !exists($$self{stats}{samples}{$key}{private}) ) { next; }
        $out .= sprintf "%d\t%s\n", $$self{stats}{samples}{$key}{private},$key;
    }
    return $out;
}

sub _init_path
{
    my ($self,$prefix) = @_;
    if ( $prefix=~m{/} )
    {
        # A directory should be created. This will populate dir and prefix, for example
        #   prefix  -> dir      prefix
        #   ----------------------------
        #   out                 out.dump
        #   out/       out/     out/out.dump
        #   out/xxx    out/     out/xxx.dump 
        #
        my $dir = '';
        if ( $prefix=~m{/[^/]+$} ) { $dir=$`; }
        elsif ( $prefix=~m{/([^/]+)/$} ) { $dir = $`.'/'.$1; $prefix = $dir.'/'.$1; }
        elsif ( $prefix=~m{([^/]+)/?$} ) { $dir=$1; $prefix=$dir.'/'.$1; }
        if ( $dir ) { `mkdir -p $dir`; }
    }
    return $prefix;
}

sub legend
{
    my ($self) = @_;
    return q[
count
    Number of positions with known genotype

nalt_X
    Number of monoallelic (X=0), biallelic (X=1), etc. sites
    
ref, ref_count
    Number of sites containing reference allele

shared
    Number of sites having a non-reference allele in 0,1,2,etc samples

snp_count
    Number of positions with SNPs
    ];
}



=head2 save_stats

    About   : Save all collected stats to files
    Usage   : 
    Args    : The prefix of output files. Non-existent directories will be created.
    Returns : N/A

=cut

sub save_stats
{
    my ($self,$prefix) = @_;

    if ( !defined $prefix )
    {
        print $self->dump();
        return;
    }

    my $path = $self->_init_path($prefix);
    $self->_write_file($path.'.legend', $self->legend());
    $self->_write_file($path.'.dump', $self->dump());
    $self->_write_file($path.'.tstv', $self->dump_tstv($$self{stats}));
    $self->_write_file($path.'.counts', $self->dump_counts());
    $self->_write_file($path.'.snps', $self->dump_snp_counts());
    $self->_write_file($path.'.indels', $self->dump_indel_counts());
    $self->_write_file($path.'.qual-tstv',$self->dump_qual_tstv);
    $self->_write_file($path.'.shared',$self->dump_shared_counts());
    $self->_write_file($path.'.private',$self->dump_private_counts());

    if ( exists($$self{stats}{samples}) )
    {
        $self->_write_file($path.'.samples-tstv',$self->dump_tstv($$self{stats}{samples}));
    }
}


sub _write_file
{
    my ($self,$fname,$text) = @_;
    open(my $fh,'>',$fname) or $self->throw("$fname: $!");
    print $fh $text;
    close($fh);
}


1;
vcftools 0.1.12+dfsg-1 / usr / share / perl5 / VcfStats.pm