/usr/bin/vcf-phased-join

#!/usr/bin/env perl

use strict;
use warnings;
use Carp;
use Vcf;

my $opts = parse_params();
if ( $$opts{split_size} ) 
{
    split_vcf($opts);
}
else
{
    join_vcfs($opts);
}

exit;

#--------------------------------

sub error
{
    my (@msg) = @_;
    if ( scalar @msg )
    {
        croak @msg;
    }
    die
        "About: The script takes multiple overlapping pre-phased chunks and concatenates them into one VCF\n",
        "   using heterozygous calls from the overlaps to determine correct phase.\n",
        "Usage: vcf-phased-join [OPTIONS] A.vcf B.vcf C.vcf\n",
        "Options:\n",
        "   -j, --min-join-quality <num>    Quality threshold for gluing the pre-phased blocks together [10]\n",
        "   -l, --list <file>               List of VCFs to join.\n",
        "   -o, --output <file>             Output file name. When \"-\" is supplied, STDOUT and STDERR will be used\n",
        "   -q, --min-PQ <num>              Break pre-phased segments if PQ value is lower in input VCFs [0.6]\n",
        "   -h, -?, --help                  This help message\n",
        "\n";
}


sub parse_params
{
    $0 =~ s{^.+/}{}; $0 .= "($Vcf::VERSION)";
    my $opts = 
    {
        args => [$0, @ARGV],
        min_join_quality => 10,
        min_PQ => 0.6,
        min_BP => 1,
    };
    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-o' || $arg eq '--output' ) { $$opts{output}=shift(@ARGV); next; }
        if ( $arg eq '-j' || $arg eq '--min-join-quality' ) { $$opts{min_join_quality}=shift(@ARGV); next; }
        if ( $arg eq '-q' || $arg eq '--min-PQ' ) { $$opts{min_PQ}=shift(@ARGV); next; }
        if ( $arg eq '-l' || $arg eq '--list' ) { $$opts{list}=shift(@ARGV); next; }
        if (                 $arg eq '--min-BP' ) { $$opts{min_BP}=shift(@ARGV); next; }
        if (                 $arg eq '--split-size' ) { $$opts{split_size}=shift(@ARGV); next; }
        if (                 $arg eq '--split-noise' ) { $$opts{split_noise}=shift(@ARGV); next; }
        if (                 $arg eq '--split-overlap' ) { $$opts{split_overlap}=shift(@ARGV); next; }
        if (                 $arg eq '--split-prefix' ) { $$opts{split_prefix}=shift(@ARGV); next; }
        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
        if ( -e $arg ) { push @{$$opts{vcfs}}, $arg; next; }
        error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
    }
    if ( exists($$opts{list}) )
    {
        open(my $fh,'<',$$opts{list}) or error("$$opts{list}: $!");
        while (my $line=<$fh>)
        {
            if ($line=~/^\s*$/) { next; }
            $line =~ s/^\s*//;
            $line =~ s/\s*$//;
            if ( ! -e $line ) { error("Some of the files in $$opts{list} do not exist\n"); }
            push @{$$opts{vcfs}},$line;
        }
        close($fh);
    }
    if ( !exists($$opts{vcfs}) ) { error("No VCF file given"); }
    if ( !exists($$opts{split_size}) )
    {
        if ( @{$$opts{vcfs}}<1 ) { error("No input VCF given?\n"); }
        if ( @{$$opts{vcfs}}<2 ) 
        { 
            warn("Only one input VCF given, running in --min-PQ splitting mode.\n"); 
            if ( $$opts{min_PQ}<=0.5 ) { warn("You better know what you're doing: --min-PQ set too low, will hardly find any split!"); }
        }
        if ( !exists($$opts{output}) ) { error("No output VCF file name given"); }
    }
    return $opts;
}

sub split_vcf
{
    my ($opts) = @_;
    my $vcf = Vcf->new(file=>$$opts{vcfs}[0]);
    $vcf->parse_header();
    $$opts{vcf} = $vcf;

    my ($fh_next,$swap_next) = open_next_file($opts);
    my ($fh,$prev_boundary,$start_pos,@buffer,$prev_chr,$prev_pos,$swap);
    while (my $rec=$vcf->next_data_array)
    {
        my $rec_next = [];
        for my $col (@$rec) { push @{$rec_next}, "$col"; }

        my $chr = $$rec[0];
        my $pos = $$rec[1];
        if ( defined $prev_chr && $prev_chr ne $chr ) { last; }
        if ( defined $prev_pos && $pos<=$prev_pos ) { error("Not sorted or duplicate position: $chr:$prev_pos vs $chr:$pos"); }
        $prev_pos = $pos;
        $prev_chr = $chr;
        if ( !defined $start_pos ) { $start_pos = $pos; }

        my $bnd = $start_pos + int(($pos-$start_pos)/$$opts{split_size})*$$opts{split_size};
        if ( $start_pos!=$bnd && abs($pos-$bnd)*2 <= $$opts{split_overlap} ) 
        { 
            # Known boundary
            if ( defined $fh_next ) { print $fh_next randomize($opts,$swap_next,$vcf,$rec_next,$pos-$bnd+$$opts{split_overlap}/2); }
            if ( defined $fh ) { print $fh randomize($opts,$swap,$vcf,$rec,$bnd+$$opts{split_overlap}/2-$pos); }
            next;
        }

        $bnd += $$opts{split_size};
        if ( abs($pos-$bnd)*2 >= $$opts{split_overlap} )
        {
            print $fh_next swap_gts($vcf,$swap_next,$rec);
            next;
        }

        # New boundary
        if ( !defined $prev_boundary || $prev_boundary ne $bnd )
        {
            close($fh) unless !defined $fh;
            $fh   = $fh_next;
            $swap = $swap_next;
            $prev_boundary = $bnd;
            $fh_next = undef;
        }
        if ( !defined $fh_next )
        {
            ($fh_next,$swap_next) = open_next_file($opts);
            $prev_boundary = $bnd;
        }
        if ( defined $fh_next ) { print $fh_next randomize($opts,$swap_next,$vcf,$rec_next,$pos-$bnd+$$opts{split_overlap}/2); }
        if ( defined $fh ) { print $fh randomize($opts,$swap,$vcf,$rec,$bnd+$$opts{split_overlap}/2-$pos); }
    }
    if ( defined $fh ) { close($fh); }
    if ( defined $fh_next ) { close($fh_next); }
}

sub open_next_file
{
    my ($opts) = @_;

    $$opts{split_ifile}++;
    my $fname = sprintf "%s%02d.vcf", $$opts{split_prefix},$$opts{split_ifile};
    open(my $fh,'>',$fname) or error("$fname: $!");
    print $fh $$opts{vcf}->format_header;

    my @swap;
    for (my $i=9; $i<@{$$opts{vcf}{columns}}; $i++)
    {
        if ( $$opts{split_ifile}==1 )
        {
            $swap[$i-9] = -1;
        }
        else
        {
            $swap[$i-9] = int(rand(2)) ? 1 : -1;
        }
        if ( $swap[$i-9]==1 )
        {
            printf "%s\t%s\tswapped\n",$fname,$$opts{vcf}{columns}[$i];
        }
    }

    return ($fh,\@swap);
}

sub randomize
{
    my ($opts,$swap,$vcf,$rec,$dist) = @_;

    if ( $dist>$$opts{split_overlap} ) { $dist = $$opts{split_overlap}; }
    my $noise = $dist/$$opts{split_overlap};
    if ( exists($$opts{split_noise}) ) { $noise = $$opts{split_noise}; }
    my $na = 2 * (scalar @$rec - 9);
    my $nchanged = int($na*$noise);
    if ( !$nchanged ) { return swap_gts($vcf,$swap,$rec); }

    use List::Util 'shuffle';
    my @errors = (1) x $nchanged;
    if ( $nchanged<$na ) { @errors = (@errors, (0) x ($na-$nchanged)); }
    @errors = shuffle(@errors);

    print "$$rec[1] .. dist=$dist, changed=$nchanged total=$na ($noise)\n";

    my $itag = $vcf->get_tag_index($$rec[8],'GT',':');
    my $i = -2;
    for (my $isample=9; $isample<@$rec; $isample++)
    {
        $i += 2;
        if ( !$errors[$i] && $errors[$i+1] ) { next; }

        my $gt = $vcf->get_field($$rec[$isample],$itag);
        my ($a1,$a2) = $vcf->split_gt($gt);
        if ( $errors[$i] ) { $a1 = $a1 ? 0 : 1; }
        if ( $errors[$i+1] ) { $a2 = $a2 ? 0 : 1; }
        $$rec[$isample] = $vcf->replace_field($$rec[$isample],"$a1|$a2",$itag,':');
    }

    return swap_gts($vcf,$swap,$rec);
}

sub swap_gts
{
    my ($vcf,$swap,$rec) = @_;
    my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); 
    my $gts = $vcf->get_sample_field($rec,$igt);
    for (my $i=0; $i<@$gts; $i++)
    {
        if ( $$swap[$i]==-1 ) { next; }
        my ($a1,$a2) = $vcf->split_gt($$gts[$i]);
        $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],"$a2|$a1",$igt,':');
    }
    return $vcf->format_line($rec);
}

sub check_columns
{
    my ($opts) = @_;
    my @columns;
    for my $file (@{$$opts{vcfs}})
    {
        my $vcf = Vcf->new(file=>$file);
        $vcf->parse_header();

        if ( @columns )
        {
            if ( @columns != @{$$vcf{columns}} ) { warn("Different number of columns in [$file].\n"); }
            for (my $i=0; $i<@columns; $i++)
            {
                if ( $$vcf{columns}[$i] ne $columns[$i] ) { warn("The column names do not agree in [$file].\n"); last; }
            }
        }
        else
        {
            @columns = @{$$vcf{columns}};
        }
        $vcf->close();
    }
    $$opts{nsamples} = @columns-9;
}

sub log_msg
{
    my ($opts,@msg) = @_;
    print {$$opts{log_fh}} @msg;
}

sub next_vcf_file
{
    my ($opts) = @_;
    if ( !exists($$opts{ifile}) ) { $$opts{ifile}=-1; }
    my $chr  = $$opts{current_chr};
    my @vcfs = @{$$opts{chroms}{$chr}};
    while (1)
    {
        $$opts{ifile}++;
        if ( $$opts{ifile} >= @vcfs ) { return (undef,undef); }
        $$opts{ivcf_fname} = $vcfs[$$opts{ifile}];
        my $vcf = Vcf->new(file=>$$opts{ivcf_fname}, region=>$chr, print_header=>1);
        $vcf->parse_header();
        my $rec = $vcf->next_data_array();
        if ( !defined $rec ) { next; }
        return ($vcf,$rec);
    }
}

sub join_vcfs
{
    my ($opts) = @_;

    # Determine the chromosomes
    for my $vcf (@{$$opts{vcfs}})
    {
        my @chroms = `tabix -l $vcf`;
        if ( $? )
        { 
            error(qq[The command "tabix -l $vcf" exited with an error. Is the file tabix indexed?\n]);
        }
        if ( !@chroms )
        {
            warn(qq[Warning: Is the VCF file $vcf empty?\n]);
        }
        for my $chr (@chroms)
        {
            chomp($chr);
            push @{$$opts{chroms}{$chr}},$vcf;
        }
    }

    check_columns($opts);
    $$opts{phased_blocks} = [ (0) x $$opts{nsamples} ];
    $$opts{broken_blocks} = [ (0) x $$opts{nsamples} ];

    for my $chr (sort keys %{$$opts{chroms}})
    {
        $$opts{current_chr} = $chr;
        join_vcfs_chr($opts);
    }
    report_stats($opts);
}

sub join_vcfs_chr
{
    my ($opts) = @_;

    delete($$opts{ifile});
    $$opts{swapped} = [ (0) x $$opts{nsamples} ];
    $$opts{phasing_set} = [ (0) x $$opts{nsamples} ];

    my ($vcf1,$rec1) = next_vcf_file($opts);
    if ( !defined $rec1 ) { error("Broken/Empty VCFs?"); }

    if ( $$opts{output} ne '-' )
    {
        my $logfile = $$opts{output};
        if ( $$opts{output}=~/\.[^\.]+$/ ) { $logfile = $`; }
        $logfile .= '.plog';
        open($$opts{log_fh},'>',$logfile) or error("$logfile: $!");
        open($$opts{out_fh},'>',$$opts{output}) or error("$$opts{output}: $!");
    }
    else
    {
        $$opts{log_fh} = \*STDERR;
        $$opts{out_fh} = \*STDOUT;
    }
    $$opts{vcf} = $vcf1;
    if ( !$$opts{header_printed} )
    {
        $$opts{header_printed} = 1;

        $$opts{vcf}->add_header_line({key=>'FORMAT',ID=>'PS',Number=>1,Type=>'Integer',Description=>'Phase set'});
        $$opts{vcf}->add_header_line({key=>'source',value=>join(' ',@{$$opts{args}})},append=>'timestamp');
        print {$$opts{out_fh}} $$opts{vcf}->format_header();

        log_msg($opts, "# This file was generated by vcf-phased-join.\n");
        log_msg($opts, "# The command line was: ", join(' ',@{$$opts{args}}), "\n");
        log_msg($opts, "#\n");
        log_msg($opts, "#PS 'Phasing Summary'. Use `grep ^PS | cut -f 2-` to extract this part.\n");
        log_msg($opts, "#PS The columns are:\n");
        log_msg($opts, "#PS         1,2 .. the pair of files being joined\n");
        log_msg($opts, "#PS           3 .. the overlapping region used for determining the phase\n");
        log_msg($opts, "#PS           4 .. sample name\n");
        log_msg($opts, "#PS           5 .. did a swap occur?\n");
        log_msg($opts, "#PS           6 .. quality of phase assignment\n");
        log_msg($opts, "#PS           7 .. number of het genotypes used for phasing\n");
        log_msg($opts, "#PS         8,9 .. log10 likelihood of phase match/mismatch\n");
    }
    $$opts{file1} = $$opts{ivcf_fname};

    my ($vcf2,$rec2) = next_vcf_file($opts);
    if ( !defined $rec2 )
    {
        # Only one non-empty VCF file present, running in --min-PQ splitting mode
        while ( defined($rec1 = $vcf1->next_data_array()) )
        {
            output_line($opts,$rec1,$$opts{swapped});
        }
        return;
    }
    else
    {
        $$opts{file2} = $$opts{ivcf_fname};
        if ( wrong_order($opts,$rec1,$rec2) )
        {
            $vcf1->_unread_line($rec1);
            $rec1 = $rec2;
        }
    }

    my @buffer;
    while (1)
    {
        # is vcf1 ahead of vcf2?
        while ( $$rec1[1] < $$rec2[1] )
        {
            output_line($opts,$rec1,$$opts{swapped});
            $rec1 = $vcf1->next_data_array();
            if ( !defined $rec1 ) { last; }
            if ( wrong_order($opts,$rec1,$rec2) )
            {
                $vcf1->_unread_line($rec1);
                $rec1 = $rec2;
            }
        }

        if ( defined $rec1 )
        {
            while ( $$rec1[1] eq $$rec2[1] )
            {
                push @buffer, [$rec1,$rec2];
                $rec1 = $vcf1->next_data_array();
                $rec2 = $vcf2->next_data_array();
                if ( !defined $rec1 ) { last; }
                if ( !defined $rec2 ) { error("The file $$opts{file1} ended before $$opts{file2}."); }
                if ( wrong_order($opts,$rec1,$rec2) )
                {
                    $vcf1->_unread_line($rec1);
                    $rec1 = $rec2;
                }
            }
            if ( defined $rec1 && $$rec1[1] ne $$rec2[1] )
            {
                error("ERROR\tThe lines out of sync: $$rec1[0]:$$rec1[1] in (1) vs $$rec2[0]:$$rec2[1] in (2), where (1)=$$opts{file1} (2)=$$opts{file2}\n");
            }
        }

        # is vcf1 done?
        if ( !defined $rec1 )
        {
            flush_buffer($opts,$vcf1,\@buffer);

            $vcf1->close();

            if ( !defined $rec2 )
            {
                # Yes, this can happen when file1 ends exactly where file2 does
                $vcf2->close();
                ($vcf2,$rec2) = next_vcf_file($opts);
                if ( !defined $rec2 ) { last; }
            }
            $vcf1 = $vcf2;
            $rec1 = $rec2;
            $$opts{file1} = $$opts{ivcf_fname};

            ($vcf2,$rec2) = next_vcf_file($opts);
            if ( !defined $rec2 ) { last; }
            $$opts{file2} = $$opts{ivcf_fname};
            if ( wrong_order($opts,$rec1,$rec2) )
            {
                $vcf1->_unread_line($rec1);
                $rec1 = $rec2;
            }
            next;
        }
    }

    if ( @buffer ) { flush_buffer($opts,$vcf1,\@buffer); }
    do
    {
        output_line($opts,$rec1,$$opts{swapped}) unless !defined $rec1;
    }
    while ( exists($$vcf1{fh}) && defined($rec1 = $vcf1->next_data_array()) );
}

sub wrong_order
{
    my ($opts,$rec1,$rec2) = @_;
    if ( $$rec1[0] ne $$rec2[0] ) { error("Encountered different chromosomes in $$opts{file1} and $$opts{file2}: \"$$rec1[0]:$$rec1[1]\" vs \"$$rec2[0]:$$rec2[1]\"\n"); }
    if ( $$rec1[1] > $$rec2[1] ) 
    { 
        log_msg($opts,"WARNING\tThe lines out of sync: $$rec1[0]:$$rec1[1] in (1) vs $$rec2[0]:$$rec2[1] in (2), where (1)=$$opts{file1} (2)=$$opts{file2}\n");
        return 1; 
    }
    return 0;
}

sub output_line
{
    my ($opts,$rec,$swap) = @_;
    
    my $vcf = $$opts{vcf};
    my $igt = $vcf->get_tag_index($$rec[8],'GT',':'); 
    my $ips = $vcf->get_tag_index($$rec[8],'PS',':');
    if ( $ips==-1 ) { $$rec[8] .=  ':PS'; }
    my $ipq = exists($$opts{min_PQ}) ? $vcf->get_tag_index($$rec[8],'PQ',':') : -1;

    my $breakpoints = 0;
    for (my $i=0; $i<@$swap; $i++)
    {
        if ( $$swap[$i]==1 ) 
        { 
            my $gt = $vcf->get_field($$rec[$i+9],$igt);
            my ($a1,$a2) = $vcf->split_gt($gt);
            if ( defined $a2 )
            {
                $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],"$a2|$a1",$igt,':');
            }
        }

        if ( $ipq!=-1 )
        {
            my $pq = $vcf->get_field($$rec[$i+9],$ipq);
            if ( $pq ne '.' && $pq<$$opts{min_PQ} ) 
            { 
                $$opts{phasing_set}[$i]=0;
                $$opts{broken_blocks}[$i]++;
            }
        }

        if ( !$$opts{phasing_set}[$i] ) 
        { 
            $$opts{phasing_set}[$i] = $$rec[1]; 
            $$opts{phased_blocks}[$i]++; 
            $breakpoints++;
        }
        if ( $ips==-1 ) 
        { 
            $$rec[$i+9] .= ':'.$$opts{phasing_set}[$i]; 
        }
        else 
        { 
            $$rec[$i+9] = $vcf->replace_field($$rec[$i+9],$$opts{phasing_set}[$i],$ips,':');
        }
    }
    print {$$opts{out_fh}} $vcf->format_line($rec);

    $breakpoints *= 100./$$opts{nsamples};
    if ( $breakpoints>$$opts{min_BP} )
    {
        push @{$$opts{breakpoints}}, sprintf("BP\t%s\t%d\t%.1f\n", $$rec[0],$$rec[1],$breakpoints);
    }
}

sub flush_buffer
{
    my ($opts,$vcf,$buffer) = @_;
    if ( !@$buffer ) 
    { 
        $$opts{phasing_set} = [ (0) x $$opts{nsamples} ];
        return;
    }

    my $chr  = $$buffer[0][0][0];
    my $from = $$buffer[0][0][1];
    my $to   = $$buffer[-1][0][1];

    # Determine likelihoods of genotypes being swapped 
    my @lks_match = ();
    my @lks_mism  = ();
    my @nsites = (0) x $$opts{nsamples};
    for my $site (@$buffer)
    {
        my $rec1 = $$site[0];
        my $rec2 = $$site[1];

        my $igt1 = $vcf->get_tag_index($$rec1[8],'GT',':'); 
        my $igt2 = $vcf->get_tag_index($$rec2[8],'GT',':'); 
        my $gts1 = $vcf->get_sample_field($rec1,$igt1);
        my $gts2 = $vcf->get_sample_field($rec2,$igt2);

        my $ngts = $$opts{nsamples};
        my $nerrors  = 0;
        my (@als1,@als2,@phased);
        for (my $i=0; $i<@$gts1; $i++)
        {
            if ( index($$gts1[$i],'|')==-1 or index($$gts2[$i],'|')==-1 )
            {
                push @phased, 0;
            }
            else
            {
                push @phased, 1;
            }
            my ($a1,$a2) = $vcf->split_gt($$gts1[$i]);
            my ($b1,$b2) = $vcf->split_gt($$gts2[$i]);
            if ( !defined $a2 ) { $a2 = $a1; }  # haploid genotypes
            if ( !defined $b2 ) { $b2 = $b1; }
            if ( !(($a1 eq $b1 && $a2 eq $b2) or ($a1 eq $b2 && $a2 eq $b1)) ) { $nerrors++ }
            push @als1, $a1,$a2;
            push @als2, $b1,$b2;
        }

        my $dist = $to-$$site[0][1] < $$site[0][1]-$from ? $to-$$site[0][1] : $$site[0][1]-$from;
        $$opts{dist_errors}{$nerrors}++;

        my $p = $nerrors/$ngts;
        if ( $p==0 ) { $p=1./$ngts; }
        elsif ( $p==1 ) { $p=1 - 1./$ngts; }

        for (my $i=0; $i<@$gts1; $i++)
        {
            if ( !$phased[$i] ) { next; }

            my $a1 = $als1[2*$i];
            my $a2 = $als1[2*$i+1];
            my $b1 = $als2[2*$i];
            my $b2 = $als2[2*$i+1];

            if ( $a1 eq $a2 or $b1 eq $b2 ) { next; }   # homozygous GT

            if ( $a1 eq $b1 && $a2 eq $b2 ) 
            { 
                #print STDERR "$i .. counting match $a1/$a2 $b1/$b2\n";
                $lks_match[$i] += log($p*$p + (1-$p)*(1-$p));
                $lks_mism[$i] += log($p*(1-$p) + (1-$p)*$p);
            }
            elsif ( $a1 eq $b2 && $a2 eq $b1 )
            {
                #print STDERR "$i .. counting mismatch $a1/$a2 $b1/$b2\n";
                $lks_match[$i] += log($p*(1-$p) + (1-$p)*$p);
                $lks_mism[$i] += log($p*$p + (1-$p)*(1-$p));
            }
            else { next; } # different alleles might have been selected at multiallelic sites

            $nsites[$i]++;
        }
    }

    my $file1 = $$opts{file1};
    my $file2 = $$opts{file2};
    my @swapped = ( (0) x $$opts{nsamples} );
    my @quals = ();
    my $log10 = log(10);
    for (my $i=0; $i<$$opts{nsamples}; $i++)
    {
        if ( !defined $lks_match[$i] )
        {
            $lks_match[$i] = $lks_mism[$i] = log(0.5);
        }
        $swapped[$i] = $lks_match[$i]>$lks_mism[$i] ? $$opts{swapped}[$i] : -1*$$opts{swapped}[$i];
        $quals[$i] = abs($lks_match[$i]-$lks_mism[$i])/$log10;

        log_msg($opts, sprintf "PS\t%s\t%s\t$chr:$from-$to\t%s\t%d\t%.1f\t%d\t%f\t%f\n", $file1,$file2, $$vcf{columns}[$i+9],
            $swapped[$i]==-1?0:1, $quals[$i], $nsites[$i], $lks_match[$i]/$log10, $lks_mism[$i]/$log10);
    }

    # Do not allow segment breaking while processing the buffer: this may help sometimes, but may also make things worse.
    my $min_PQ = $$opts{min_PQ}; 
    delete($$opts{min_PQ});

	# In case there is no overlap, reset the phasing set
	if ( !@quals ) { $$opts{phasing_set} = [ (0) x $$opts{nsamples} ]; }

    # Output the VCF line and quality
    for my $site (@$buffer)
    {
        # Which of the two overlapping VCF lines to output? Take the one farther from the end.
        my ($rec,$swap);
        if ( $to-$$site[0][1] > $$site[0][1]-$from )
        {
            $rec = $$site[0];
            $swap = $$opts{swapped};
        }
        else
        {
            # Update the phasing set ID
            if ( @quals )
            {
                for (my $i=0; $i<@quals; $i++)
                {
                    if ( $quals[$i] < $$opts{min_join_quality} ) { $$opts{phasing_set}[$i]=0; }
                }
                @quals = ();
            }
            $rec = $$site[1];
            $swap = \@swapped;
        }
        output_line($opts,$rec,$swap);
    }

    $$opts{min_PQ} = $min_PQ; 

    @$buffer = ();
    $$opts{swapped} = \@swapped;
}

sub report_stats
{
    my ($opts) = @_;

    log_msg($opts, "#NS Number of phased segments. Use `grep ^NS | cut -f 2-` to extract this part.\n");
    log_msg($opts, "#NS The columns are:\n");
    log_msg($opts, "#NS         1 .. sample\n");
    log_msg($opts, "#NS         2 .. number of phased blocks\n");
    log_msg($opts, "#NS         3 .. number of blocks created because of low PQ\n");
    log_msg($opts, "#NS         4 .. number of blocks created because of low joining quality\n");
    for my $i (sort { $$opts{phased_blocks}[$b] <=> $$opts{phased_blocks}[$a] } (0..($$opts{nsamples}-1)))
    {
        log_msg($opts,sprintf "NS\t%s\t%d\t%d\t%d\n", $$opts{vcf}{columns}[9+$i],$$opts{phased_blocks}[$i],$$opts{broken_blocks}[$i],$$opts{phased_blocks}[$i]-$$opts{broken_blocks}[$i]);
    }

    log_msg($opts, "#BP Break Points. Use `grep ^BP | cut -f 2-` to extract this part.\n");
    log_msg($opts, "#BP The columns are:\n");
    log_msg($opts, "#BP         1 .. chromosome\n");
    log_msg($opts, "#BP         2 .. position\n");
    log_msg($opts, "#BP         3 .. percent of samples with breakpoint at that position\n");
    for my $break (@{$$opts{breakpoints}})
    {
        log_msg($opts,$break);
    }

    log_msg($opts, "#ED Error Distribution. Use `grep ^ED | cut -f 2-` to extract this part.\n");
    log_msg($opts, "#ED The columns are:\n");
    log_msg($opts, "#ED         1 .. number of GT mismatches per site not attributable to phasing\n");
    log_msg($opts, "#ED         2 .. frequency \n");
    for my $nerrors (sort {$a<=>$b} keys %{$$opts{dist_errors}})
    {
        log_msg($opts, "ED\t$nerrors\t$$opts{dist_errors}{$nerrors}\n");
    }
}
vcftools 0.1.15-1 / usr / bin / vcf-phased-join