/usr/bin/mcmc_analysis

#! /usr/bin/perl -w

use POSIX qw(floor);
#use Math::Libm 'log1p';

my $do_plotting = 0;
my $do_coda_output = 0;
my $do_coda_trees = 0;
my $count_lines = 0;		# Only count the number of samples
my $forget_the_trees = 0;	# We can skip trees, since they can be quite voluminous.
my $plot_param = undef;
my $max_n_points = 100;		# The max number of points in a plot;
my $parallel_chains = 0;	# Indicate that input chains are parallel
my $show_points = 0;		# Should the points be explicitly indicated?

my $burnin = 0.1;
my $format = 'txt';		# Choose how to produce output. txt/latex.

my $best_tree_only = 0;
my %param_handlers = ( float    => \&analyze_float,
		       logfloat => \&analyze_logfloat,
		       integer  => \&analyze_integer,
		       tree     => \&analyze_tree,
		       orthologypairs => \&analyze_orthologypairs,
		       reconciliation => \&analyze_reconciliation,
		     );

# The following types are understood by CODA (an R package):
my %coda_types = (float => undef,
		  logfloat => undef,
		  integer => undef,
		  );

# This hash contains the column names that should be ignored in the output.
my %ignoramus = ();

my $usage = "Usage: mcmc_analysis [<options>] <MCMC data> ...

This program reads MCMC output, from one or several files, from
programs in the BEEP package. It is important that the same columns
are present in each input file.

Options:
   -b [<float>|<int>]
               The percentage (0 <= x <1), or the number of (x is integer >= 1)
               of the input to be discarded as burnin. Default: $burnin.

   -p <string> \"Plot\" the parameter named <string>. Output is two columns,
               the iteration number and the parameter's value in the iteration.

   -i <string> Ignore the named parameter. See header line in MCMC output for
               column names. You can name several columns at once using a comma-
               delimited format, e.g.
                 -i Length,Name
               No spaces allowed between column names!

   -t          Output LaTex for the analysis.

   -l          Count and report the number of samples in the input file.
   -mp <int>   The maximum number of points to plot.
   -sp         Explicitly indicate points in plots.

   -coda       Output file for the CODA package in R.
   -codatrees  Same as -coda, but includes tree parameters. Each tree is output
               as an integer ID (order of visitation in chain, 1,2,...). Make
               sure to use -i to hide any trees containing lengths/times/rates.

   -P          Parallel chains. This implies that several files of (parallel)
               samples are listed.


In:  Output from MCMC iterations on the format
        <likelihood> <tab> <iter> <tab> <params>
     where 
        <likelihood> is the logarithm of the likelihood in float format
        <tab>        is tab whitespace separating the fields
        <iter>       is an integer for the ordinal of the iteration
        <params>     is a list of fields separated by semicolons containing
                     the parameters of the MCMC.

     The MCMC params are typed and given names by the first line in the file, 
     which is on the format
        # L <tab> N <tab> [<name>(<type>)]+
     The names ought to be unique, but do not have to be. The <type> is one 
     of
        float
        logfloat
        integer
        tree
        orthologypairs
     and used to infer how to parse and analyze the rest of the lines.

Out: A report on the MCMC run, with posterior estimates of the parameters.

";


if (scalar(@ARGV) > 0) {
  while ($ARGV[0] =~ m/^-\S+/) {
    my $opt = shift @ARGV;
    if ($opt eq '-b') {
      if (@ARGV < 2) {
	print STDERR "Too few arguments to mcmc_analysis!\n";
	exit 7;
      }
      my $opt = shift @ARGV;
      if ($opt =~ m/^(0\.\d+)$/) {
	$burnin = $1;
      } elsif ($opt =~ m/^[1-9]\d*$/) {
	$burnin = $opt;
      } else {
	print STDERR "The burnin parameter (-b) must be in the range [0, 1),
for example '-b 0.25', or an integer >= 1.\n";
      }
    } elsif ($opt eq '-p') {
      if (@ARGV < 2) {
	print STDERR "Too few arguments to mcmc_analysis!\n";
	exit 7;
      }
      $plot_param = shift @ARGV;
      $do_plotting = 1;
    } elsif ($opt eq '-i') {
      if (@ARGV < 2) {
	print STDERR "Too few arguments to mcmc_analysis!\n";
	exit 7;
      }
      foreach my $i (split(/,/, shift @ARGV)) {
	$ignoramus{$i} = undef;
      }
    } elsif ($opt eq '-coda') {
      $do_coda_output = 1;
    } elsif ($opt eq '-codatrees') {
      $do_coda_output = 1;
      $do_coda_trees = 1;
    } elsif ($opt eq '-t') {
      $format = 'latex';
    } elsif ($opt eq '-l') {
      $count_lines = 1;
    } elsif ($opt eq '-g') {
      $forget_the_trees = 1;
    } elsif ($opt eq '-P') {
      $parallel_chains = 1;
    } elsif ($opt eq '-sp') {
      $show_points = 1;
    } elsif ($opt eq '-mp') {
      my $next_int = 0;
      if (@ARGV < 2) {
	print STDERR "Too few arguments to mcmc_analysis. Expected an integer after '-mp'.\n";
	exit 9;
      } else {
	$next_int = shift @ARGV;
	if ($next_int < 5) {
	  print STDERR "Too few points to plot. Using default instead.\n";
	} else {
	  $max_n_points = $next_int;
	}
      }
    } elsif ($opt eq '-bto') {
      $best_tree_only = 1;
    } else {
      print STDERR "Unknown option: '$opt'\n";
      exit 8;
    }
  }
}

if (@ARGV < 1) {
  print STDERR $usage;
  exit 1;
}
my $filename = shift @ARGV;


open(F, "<$filename")
  or die "mcmc_analysis: Could no open '$filename' for reading. ";

#
# Parse the input's header file with parameter names and types
#
my $commandline='';
my $jobname=undef;
my @columnheaders;
while (<F>) {			# Read all comments until the actual header line comes
  if (m/^\#\s+[LTS]\s+N\s+(\S.+)$/) { # Header!
    @columnheaders = split(/\s+/, $1);
    last;
  } elsif (m/^#\s*Running:\s*(\S.+)/) {
    $commandline = $1;
  } elsif (m/^#\s*Name:\s*(\S.+)/) {
    $jobname = $1;
  } elsif (m/^\#/) {		# Comment!
    next;
  }
}

# Basic error checking (very basic...)
my $num_columns = scalar(@columnheaders);
if ($num_columns < 2) {
  print STDERR "Bad format of MCMC file header.\n";
}


#
# Extract the info about the different input columns
#
my @names;
my @types;

for (my $i = 0; $i < $num_columns; $i++) {
  $columnheaders[$i] =~ m/(\S+)\((\S+)\)/;
#  $columnheaders[$i] =~ m/([^\(\)\s]+)\((\S+)\)/;
  my $name = $1;
  my $type = $2;

  if (! defined $1 || length $1 == 0) {
    print STDERR "An undefined name for column $i. Please give a name in header line!\n";
    exit 4;
  }

  if (! defined $2 || length $2 == 0) {
    print STDERR "Missing type for column $i. Please give a type in header line!\n";
    exit 4;
  }

  $names[$i] = $name;
  if (exists $ignoramus{$name}) {
    $types[$i] = 'ignore';
  } else {
    $types[$i] = lc($type);
  }

  if (!exists $param_handlers{$type}) {
    print STDERR "Unknown type for column $i: $type\n";
  }
}

#
# Check to see if the user asked to plot a parameter that exists!
#
if ($do_plotting) {
  my $found = 0;
  if ($plot_param ne 'L') {
    foreach my $c (@names) {
      if ($plot_param eq $c) {
	$found = 1;
	last;
      }
    }
    if ($found == 0) {
      print STDERR "There is no parameter '$plot_param' to plot. Available parameters:\n";
      foreach my $c (@names) {
	print STDERR "\t$c\n";
      }
      exit 9;
    }
  }
}


#
# Start reading the actual data and parse it according to the header.
# Data will be put in a hash tables index by the iteration number.
#
##my %L = ();			# Likelihood values
#my %I = ();			# Save iteration number here. I want to index based on lineno for some reason...
my %param = ();			# A hash of hashes: $param{$param}{$iteration}
my $line_in_file = 0;		# Current line in current file
my $sample_no = 0;		# When appending files, counts the total num of lines
my $previous_iterations = 0;	# Remember num iterations from previous files.
my $iter = 0;
my $max_L = -1000000000000000;	# Our highest likelihood, uh, max posterior probability
my $max_L_sample = 0;		# The index of the best probability solution (max posterior prob).
my @filelist = ($filename);

my $bad_lines = 0;		# Track the number of strange rows.

# In order to be able to read several parallel chains, we want to be able to throw away
# burnin from all of them and not just concatenate the data (which we still want to do
# in some cases). So if the parallel_chains flag is true, we do the following.
#  - If burnin is given as a fraction (say, 10 %), then after the first file we calculate
#    what that means in actual number of lines.
#  - Brutally remove the burnin from the reading of input! This means that we do not get
#    the burnin plotted for anything but the first file.
#  - Treat the parallel chains as if we can concatenate them.
#
# Later on, we have to change this code to treat each chain separately to be able to do
# convergence testing etc.
my $reading_burnin = 0;
my $the_actual_burnin = 0;
do {
  while (<F>) {
    chomp;
    $line_in_file++;
    if (m/^\s*#/) {
      next;
    }

    if ($parallel_chains && $reading_burnin > 0) {
      $reading_burnin--;
      next;
    } else {
      $sample_no++;
    }

    if (m/(^-?\d[\d\.ef\-\+]*)\s+(\d+)\s+(\S.+)$/) {
      my $likelihood = $1;
      if ($likelihood > $max_L) {
	$max_L = $likelihood;
	$max_L_sample = $sample_no;
      }

      $iter = $2 + $previous_iterations;
      my @parameters = split(/;\s*/, $3);

      my $nparams = scalar(@parameters);
      if ($nparams != $num_columns) {
	$bad_lines++;
	print STDERR "The number of columns ($nparams) on line $line_in_file, iteration $iter, does no match the header ($num_columns)!\n";
	if ($bad_lines > 4) {
	  print STDERR "Giving up: Too many bad lines in $filename.\n";
	  exit 5;
	} else {
	  next;
	}
      }

      $param{'L'}{$sample_no} = $likelihood;
      $param{'iteration'}{$sample_no} = $iter;
      #    $I{$sample_no} = $iter;

      for (my $i = 0; $i < $num_columns; $i++) {
	$types[$i] = lc($types[$i]);
	# floating point numbers
	if ($types[$i] eq 'float') {
	  if ($parameters[$i] =~ m/^(-?\d[\d\.ef\-\+]*)$/) {
	    $param{$names[$i]}{$sample_no} = $1;
	  } else {
	    type_mismatch('float', $parameters[$i], $filename, $line_in_file, $i, $iter);
	  }
	} elsif ($types[$i] eq 'logfloat') {
	  # floats stored in logarithmic form
	  if ($parameters[$i] =~ m/^(-?\d[\d\.ef\-\+]*)$/) {
	    $param{$names[$i]}{$sample_no} = $1;
	  } else {
	    type_mismatch('logfloat', $parameters[$i], $filename, $line_in_file, $i, $iter);
	  }
	} elsif ($types[$i] eq 'integer') {
	  # Common integers
	  if ($parameters[$i] =~ m/^-?(\d+)$/) {
	    $param{$names[$i]}{$sample_no} = $1;
	  } else {
	    type_mismatch('integer', $parameters[$i], $filename, $line_in_file, $i, $iter);
	  }
	} elsif ($types[$i] eq 'tree') {
	  # Trees in plain Newick format
	  if ($forget_the_trees) {
	    next;
	  }
	  if ($parameters[$i] =~ m/^(\(.+\)(:\d+(\.\d+)?(e[+-]?\d+)?)?)(\[[^\]]+\])?$/ 
	      || $parameters[$i] =~ m/^([^\(\)\ ]+)$/) {
	    $param{$names[$i]}{$sample_no} = $1;
	  } else {
	    type_mismatch('tree', $parameters[$i], $filename, $line_in_file, $i, $iter);
	  }
	} elsif ($types[$i] eq 'orthologypairs') {
	  # Orthology predictions
	  if ($parameters[$i] =~ m/^\[((\[\S+,\S+\]\s*=\s*(1|0|(0\.\d+)|(\d\.\d+e\-\d+))\s*)*)\]$/) {
	    #	if ($parameters[$i] =~ m/^\[((\[(\S(?!\]))+,(\S(?!\]))+\S\]\s*=\s*(1|(0\.\d+)|(\d\.\d+e\-\d+))\s*)*)\]$/) {
	    $param{$names[$i]}{$sample_no} = $1;
	  } else {
	    type_mismatch('orthologypairs', $parameters[$i], $filename, $line_in_file, $i, $iter);
	  }
	} elsif ($types[$i] eq 'reconciliation') {
	  # reconciliations
	  if ($parameters[$i] =~ m/^(\(.+\)?)$/) {
	    my $val = $1;
	    $val =~ s/\s+ID=\d+//g;
	    $param{$names[$i]}{$sample_no} = $val; 
	  } else {
	    type_mismatch('reconciliations', $parameters[$i], $filename, $line_in_file, $i, $iter);
	  }
	} elsif ($types[$i] eq 'ignore') { # For columns we have chosen to ignore (see options)!
	  # Nothing
	}
      }
    }
  }
  close(F);
  if ($filename = shift @ARGV) {
    push @filelist, $filename;
    open(F, "<$filename");
    $previous_iterations = $iter;
    $line_in_file = 0;

    if ($parallel_chains) {
      if($the_actual_burnin == 0){
	$the_actual_burnin = get_actual_burnin($burnin, $sample_no);
      }
      $reading_burnin = $the_actual_burnin;
    }
  }
} while ($filename);


if ($count_lines) {
  print $sample_no, "\n";
  exit;
}

if ($sample_no < 3) {
  print STDERR "Only $sample_no data points found. Too little data!\n";
  exit 1;
}


my $actual_burnin = $the_actual_burnin; # if not parallel chains this will set $actual_burnin = 0
if($actual_burnin == 0){
  $actual_burnin = get_actual_burnin($burnin, $sample_no);
}


if ($do_plotting == 1) {
#
# Output plot data
#
  foreach my $j (sort { $a <=> $b} keys %{$param{$plot_param}}) {
    if ($j >= $actual_burnin)
      {
	print $param{'iteration'}{$j}, "\t", $param{$plot_param}{$j}, "\n";
      }
  }

} elsif ($do_coda_output) {

  my @tree_num_hash_list = ();  # Per-parameter list with hashmaps of tree IDs.
  
  print "\tL\tN"; #, join("\t", @names), "\n";
  for(my $i=0; $i<scalar(@names); $i++) {
    if (exists $coda_types{$types[$i]}) {
      print "\t", $names[$i];
    }
    elsif($types[$i] eq 'tree' && $do_coda_trees == 1) {
      print "\t", $names[$i];
      my $name = $names[$i];
      my $data = \%param;
      my %tree_num = ();        # Identifies in which order the trees are found.
      my $tree_counter = 1;
      my $start = $actual_burnin + 1;
      my $N = scalar(keys(%{$data->{$name}})) - $actual_burnin;
      for (my $j = $start; $j < $start + $N; $j++) {
        my $T = $data->{$name}{$j};
        if (! exists($tree_num{$T})) {
          $tree_num{$T} = $tree_counter++;
        }
      }
      push(@tree_num_hash_list, {%tree_num});
    }
  }

  print "\n";
  foreach my $i (sort {$a <=> $b} keys %{$param{'L'}}) {
    if ($i > $actual_burnin) {
      print $i, '  ';
      print $param{'L'}{$i}, "\t";
      print $param{'iteration'}{$i}, "\t";
      my $tree_param_num = 0;
      for (my $j = 0; $j < $num_columns; $j++) {
        if (exists $coda_types{$types[$j]}) {
	  print $param{$names[$j]}{$i}, "\t";
        }
        elsif ($types[$j] eq 'tree' && $do_coda_trees == 1) {
          print $tree_num_hash_list[$tree_param_num]{$param{$names[$j]}{$i}}, "\t";
          $tree_param_num++;
	}
      }
      print "\n";
    }
  }

} else {

  #
  # Time to analyze
  #
  if($best_tree_only == 1) {
    for (my $j = 0; $j < $num_columns; $j++) {
      my $type = $types[$j];
      my $name = $names[$j];
      
      #  print STDERR "-> $type, $name, ", $param{$name}, "\n";
      if ($type eq 'tree') {
	$param_handlers{$type}->($name, $actual_burnin, $max_L_sample, $format, \%param);
      }
    }
    exit;
  }
  print_prologue($jobname, \@filelist, $commandline, $sample_no, $actual_burnin);

  # Analyze the likelihood value
  analyze_logfloat('L', $actual_burnin, $max_L_sample, $format, \%param);

  for (my $j = 0; $j < $num_columns; $j++) {
    my $type = $types[$j];
    my $name = $names[$j];

    #  print STDERR "-> $type, $name, ", $param{$name}, "\n";
    if ($type ne 'ignore') {
      $param_handlers{$type}->($name, $actual_burnin, $max_L_sample, $format, \%param);
    }

  }

  print_end();
}

### Helpers ########################################################

sub type_mismatch {
  my ($typename, $actual_param, $fname, $line_in_file, $i, $iter) = @_;

  # If we have had bad lines befor, just give up. Too much information
  # for the user is just bad.
  if ($bad_lines > 0) {
    exit 6;
  } else {
    print STDERR "Type mismatch in column $i on line $line_in_file in $fname, iteration $iter.\n";
    print STDERR "Expected '$typename', but found '$actual_param'.\n";

    # We might have read the last line in the input, which is often
    # corrupt if the computation is not yet done. In that case it is
    # OK to just drop it.
  }
}


#
# Analyze a float param
#
sub analyze_a_number {
  my ($burnin, $data) = @_;

  my $N = scalar(keys(%$data)) - $burnin;
  my @ad = ();


  # Mean
  my $sum = 0;
  my $hsum = 0;
  my $harmonic_possible = 1;	# This flag will be turned off if we encounter "0" in data.
  my $start = $burnin + 1;
  for (my $j = $start; $j < $start + $N; $j++) {
    my $dat = $data->{$j};
    $sum += $dat;
    if ($harmonic_possible) {
      if ($dat != 0) {
	$hsum += 1.0 / $dat;
      } else {
	$harmonic_possible = 0;
      }
    }
    push @ad, $dat;
  }
  my $mean = $sum / $N;
  my $harmonic_mean = undef;
  my $harmonic_sdev = undef;
  if ($harmonic_possible) {
    $harmonic_mean = $N / $hsum;
  }

  # Standard deviation
  $sum = 0;
  for (my $j = $start; $j < $start + $N; $j++) {
    $sum += ($data->{$j} - $mean)**2;
  }
  my $sdev = sqrt($sum / ($N - 1));

  # Find extreme points
  my @sda = sort {$a <=> $b} @ad;
  my $n = scalar(@sda);
  my $ma= $sda[$n-1];
  my $mi= $sda[0];

  # Baysian confidence interval, 95% and 99%.
  my $bc90 = 'N/A';
  my $bc95 = 'N/A';
  my $bc99 = 'N/A';
  if ($N >= 100) {		# Don't bother if too little data
    # Find where the mean is "located": Should be binary search here...
    # A pathological case can happen when a parameter has been constant the whole run!
    my $start=0;
    for ($start=0; $start < (scalar(@sda)) && $sda[$start]<$mean; $start++) 
      {
      }
    if ($start == $N) {
      $start = $N / 2;
    }

    my $left = $start - 1;
    my $right = $start + 1;
    while (($right-$left + 1) / $N < 0.9) {
#      print STDERR "$left, $start, $right: ", ($right-$left + 1) / $N, ", [", $sda[$left], ', ', $sda[$right], "] mean=$mean\n";      
      if ($left == 0) {
	$right++;
      } elsif ($right >= $N-1) {
	$left--;
      } elsif (abs($sda[$right] - $mean) < abs($sda[$left] - $mean)) {
	$right++;
      } else {
	$left--;
      }
    }
    $bc90 = $sda[$left] . ', ' . $sda[$right];
    while (($right-$left + 1) / $N < 0.95) {
      if ($left == 0) {
	$right++;
      } elsif ($right >= $N-1) {
	$left--;
      } elsif (abs($sda[$right] - $mean) < abs($sda[$left] - $mean)) {
	$right++;
      } else {
	$left--;
      }
    }
    $bc95 = $sda[$left] . ', ' . $sda[$right];
    while (($right-$left + 1) / $N < 0.99) {
      if ($left == 0) {
	$right++;
      } elsif ($right >= $N-1) {
	$left--;
      } elsif (abs($sda[$right] - $mean) < abs($sda[$left] - $mean)) {
	$right++;
      } else {
	$left--;
      }
    }
    $bc99 = $sda[$left] . ', ' . $sda[$right];
  }

  return ($mean, $harmonic_mean, $sdev, $ma, $mi, $bc90, $bc95, $bc99);
}

#
# Analyze a float param
#
sub analyze_a_log {
  my ($burnin, $data) = @_;
  my $N = scalar(keys(%$data)) - $burnin;
  my @ad = ();


  # Mean
  my $sum = 0;
  my $hsum = 0;
  my $harmonic_possible = 0;	# This flag will be turned off if we encounter "0" in data.
  my $start = $burnin + 1;
  for (my $j = $start; $j < $start + $N; $j++) {
    my $dat = $data->{$j};
    if ($j == $start) {
      $sum = $dat;
    } else {
      $sum = addlog($sum, $dat);
    }
    if ($harmonic_possible) {
      if ($dat != 0) {
	if ($j == $start) {
	  $hsum = -$dat;
	}
	else{
	$hsum = addlog($hsum, -$dat);
      }
      } else {
	$harmonic_possible = 0;
      }
    }
    push @ad, $dat;
  }
  my $mean = $sum - log($N);
  my $harmonic_mean = undef;
  if ($harmonic_possible) {
    $harmonic_mean = log($N) - $hsum;
#  }

  # Standard deviation
  $hsum = 0;
  for (my $j = $start; $j < $start + $N; $j++) {
    if($j == $start) {
      $hsum = addlog(-$data->{$j}, $harmonic_mean)*2;
    }
    else {
      $hsum = addlog($hsum,addlog(-$data->{$j}, $harmonic_mean)*2);
    }
  }
  $harmonic_sdev = (0.5*($hsum - log($N - 1))+2*$harmonic_mean)-0.5*log($N-1);
  }
  $sum = 0;
  for (my $j = $start; $j < $start + $N; $j++) {
    $sum += ($data->{$j} - $mean)**2;
  }
  my $sdev = sqrt($sum / ($N - 1));

  # Find extreme points
  my @sda = sort {$a <=> $b} @ad;
  my $n = scalar(@sda);
  my $ma= $sda[$n-1];
  my $mi= $sda[0];

  # Baysian confidence interval, 95% and 99%.
  my $bc90 = 'N/A';
  my $bc95 = 'N/A';
  my $bc99 = 'N/A';
  if ($N >= 100) {		# Don't bother if too little data
    # Find where the mean is "located": Should be binary search here...
    # A pathological case can happen when a parameter has been constant the whole run!
    my $start=0;
    for ($start=0; $start < (scalar(@sda)) && $sda[$start]<$mean; $start++) 
      {
      }
    if ($start == $N) {
      $start = $N / 2;
    }

    my $left = $start - 1;
    my $right = $start + 1;
    while (($right-$left + 1) / $N < 0.9) {
#      print STDERR "$left, $start, $right: ", ($right-$left + 1) / $N, ", [", $sda[$left], ', ', $sda[$right], "] mean=$mean\n";      
      if ($left == 0) {
	$right++;
      } elsif ($right >= $N-1) {
	$left--;
      } elsif (abs($sda[$right] - $mean) < abs($sda[$left] - $mean)) {
	$right++;
      } else {
	$left--;
      }
    }
    $bc90 = $sda[$left] . ', ' . $sda[$right];
    while (($right-$left + 1) / $N < 0.95) {
      if ($left == 0) {
	$right++;
      } elsif ($right >= $N-1) {
	$left--;
      } elsif (abs($sda[$right] - $mean) < abs($sda[$left] - $mean)) {
	$right++;
      } else {
	$left--;
      }
    }
    $bc95 = $sda[$left] . ', ' . $sda[$right];
    while (($right-$left + 1) / $N < 0.99) {
      if ($left == 0) {
	$right++;
      } elsif ($right >= $N-1) {
	$left--;
      } elsif (abs($sda[$right] - $mean) < abs($sda[$left] - $mean)) {
	$right++;
      } else {
	$left--;
      }
    }
    $bc99 = $sda[$left] . ', ' . $sda[$right];
  }

  return ($mean, $sdev, $harmonic_mean, $harmonic_sdev, $ma, $mi, $bc90, $bc95, $bc99);
}

#
# Let gnuplot create a plot in LaTeX' picture mode
#
sub make_plot {
  my ($name, $burnin, $data) = @_;

  my $datafile1 = '/tmp/mcmc_analysis1.dat';
  my $datafile2 = '/tmp/mcmc_analysis2.dat';
  my $plotfile = '/tmp/mcmc_analysis.gp';
  my $texfile = '/tmp/mcmc_analysis.tex';

  open(D1, ">$datafile1") or die "Could not create temporary datafile!";
  open(D2, ">$datafile2") or die "Could not create temporary datafile!";
  open(F, ">$plotfile") or die "Could not create temporary file for gnuplot!";
  print F "set terminal latex
set output \"$texfile\"
set xlabel \"Iteration\"
set key off
set pointsize 0.5
";

  my $start = $burnin + 1;

  my $mi=999999999999999;
  my $ma=-9999999999999999;

  # Points before burnin
  my $N = scalar(keys(%{$data->{$name}})) - $burnin;
  for (my $j = 1; $j <= $start; $j++) {
    my $y = $data->{$name}{$j};
    print D1 $data->{'iteration'}{$j}, "\t", $y, "\n";
    if ($y > $ma) {
      $ma = $y;
    } elsif ($y < $mi) {
      $mi = $y;
      }
  }

  # Points after burnin
  my $reduction = 1;
  my $npoints = $N - $start;
  if ($npoints > $max_n_points) {
    $reduction = int($npoints / $max_n_points);
  }
  for (my $j = $start+1; $j < $start + $N; $j++) {
    my $y = $data->{$name}{$j};
    if ($j % $reduction == 0) {
      print D2 $data->{'iteration'}{$j}, "\t$y\n";
    }
    if ($y > $ma) {
      $ma = $y;
    } elsif ($y < $mi) {
      $mi = $y;
    }
  }
  close(D1);
  close(D2);

  # Give a hint about burnin
  my $x = $data->{'iteration'}{$start};
  print F "set arrow from $x,$mi to $x,$ma nohead\n";

  my $pointstyle = 0;
  if ($show_points) {
    $pointstyle = 10;
  }
  print F "plot \"$datafile1\" with points 0, \"$datafile2\" with linespoints 1 $pointstyle\n";
  close(F);
  system('gnuplot', $plotfile);
  open(T, "<$texfile") or die "No LaTeX file produced by gnuplot!";
  print "\\begin{center}\n";
  while (<T>) {
    print;
  }
  print "\\end{center}\n";
  if ($reduction > 1) {
    print "{\\footnotesize Only 1 out $reduction points are plotted due to the large number of data points ($npoints).}\n";
  }

  # For some reason, it is suitable to break pages after a plot!
  print "\\pagebreak\n";

   unlink $datafile1;
   unlink $datafile2;
#   unlink $plotfile;
#   unlink $texfile;
}


#
# Floats
#
sub analyze_float {
  my ($name, $burnin, $max_sample, $format, $data) = @_;

  my ($mean, $harmonic_mean, $sdev, $ma, $mi, $bc90, $bc95, $bc99) = analyze_a_number($burnin, $data->{$name});
  my $max_param = $data->{$name}{$max_sample};

  if ($format eq 'txt') {
    printf("%s\tMean = %.6g, SD = %.6g,
\tMax = %.6g, min = %.6g
\tAt max posterior probability: %.6g
\tBayesian confidence: 90 %% in [%s],
\t                     95 %% in [%s],
\t                     99 %% in [%s].\n",
	   $name, $mean, $sdev, $ma, $mi, $max_param, $bc90, $bc95, $bc99);
  } else {
    print_tex_number_stats($name, $mean, $sdev, undef, $ma, $mi, $max_param, $bc90, $bc95, $bc99);
    make_plot($name, $actual_burnin, $data);
  }
}

#
# log floats
#
sub analyze_logfloat {
  my ($name, $burnin, $max_sample, $format, $data) = @_;
  my ($mean, $harmonic_mean, $sdev, $ma, $mi, $bc90, $bc95, $bc99) = analyze_a_number($burnin, $data->{$name});
#  my ($mean,  $sdev, $harmonic_mean, $harmonic_sdev, $ma, $mi, $bc90, $bc95, $bc99) = analyze_a_log($burnin, $data->{$name});
  my $max_param = $data->{$name}{$max_sample};

  if ($format eq 'txt') {
#    # Make sure that the troublesome harmonic_mean always is a string.
#    if (defined $harmonic_mean) {
#      $harmonic_mean = sprintf("%.6g", $harmonic_mean);
#    } else {
      $harmonic_mean = 'N/A';
      $harmonic_sdev = 'N/A';
#    }
    printf("%s\tMean = %.6g, SD = %.6g,\n
\tHarmonic Mean = %s, Harmonic SD = %s\n
\tMax = %.6g, min = %.6g
\tAt max posterior probability: %.6g
\tBayesian confidence: 90 %% in [%s],
\t                     95 %% in [%s],
\t                     99 %% in [%s].\n",
	   $name, $mean, $sdev, $harmonic_mean, $harmonic_sdev, $ma, $mi, $max_param, $bc90, $bc95, $bc99);
  } else {
    print_tex_number_stats($name, $mean, $sdev, $harmonic_mean, $ma, $mi, $max_param, $bc90, $bc95, $bc99);
    make_plot($name, $actual_burnin, $data);
  }
}

#
# Integers
#
sub analyze_integer {
  my ($name, $burnin, $max_sample, $format, $data) = @_;
  my ($mean, $harmonic_mean, $sdev, $ma, $mi, $bc90, $bc95, $bc99) = analyze_a_number($burnin, $data->{$name});
  my $max_param = $data->{$name}{$max_sample};

  if ($format eq 'txt') {
    printf("%s\tMean = %.6g, SD = %.6g,
\tMax = %.6g, min = %.6g
\tAt max posterior probability: %.6g
\tBayesian confidence: 90 %% in [%s],
\t                     95 %% in [%s],
\t                     99 %% in [%s].\n",
	   $name, $mean, $sdev, $ma, $mi, $max_param, $bc90, $bc95, $bc99);
  } else {
    print_tex_number_stats($name, $mean, $sdev, undef, $ma, $mi, $max_param, $bc90, $bc95, $bc99);
    make_plot($name, $actual_burnin, $data);
  }
}

#
# Simple trees
#
sub analyze_tree {
  my ($name, $burnin, $max_sample, $format, $data) = @_;

  if ($forget_the_trees) {
    return;
  }

  my %th = ();
  my %tree_num = (); # Identifies in which order the trees are found.
  my $tree_counter = 1;
  my $start = $burnin + 1;
  my $N = scalar(keys(%{$data->{$name}})) - $burnin;

  for (my $j = $start; $j < $start + $N; $j++) {
    my $T = $data->{$name}{$j};
    if (! exists($th{$T})) {
      $tree_num{$T} = $tree_counter++;
    }
    $th{$T}++;
  }


  if ($best_tree_only == 1) {
    foreach my $t (sort {$th{$b} <=> $th{$a}} keys %th) {
      printf("%s\n",  $t);
      printf("%d\n",  int(($th{$t} / $N)*100));
      exit;
    }
    print "\n";
  } elsif ($format eq 'txt') {
    print "$name\tRanked by probability\n";
    print "\tNum\tPostProb\tTree\n";
    foreach my $t (sort {$th{$b} <=> $th{$a}} keys %th) {
      printf("\t%3s\t%.6g\t%s\n", $tree_num{$t}, $th{$t} / $N, $t);
    }
    print "\n";
  } else {			# LaTeX
    my @trees = sort {$th{$b} <=> $th{$a}} keys %th;
    my $n_trees = scalar(@trees);
    my $pname = $name;
    $pname =~ s/_/\\_/g;
    print "\\pagebreak\\section{Trees: $pname}\n";
    print "Number of sampled trees: $n_trees\n";

    # Create a histogram for the tree distribution
    print "\\subsubsection*{The tree posterior distribution}\n";

    if ($n_trees > 100) {
      @trees = @trees[0..99];
      $n_trees = 100;

      print "Here are the 100 most sampled trees charted.\n";
    }

    my $factor=5;
    my $width=$n_trees * $factor;
    my $height=110;
    print "\\begin{picture}($width, $height)(0,0.1)\n";
    for (my $i=0; $i<$n_trees; $i++) {
      my $x = $i*$factor;
      if ($i % 5 == 0) {
	print "\\put($x,0){$i}\n";
      }
      my $len=100 * $th{$trees[$i]} / $N;
      print "\\put($x,10){\\line(0,1){$len}}\n";
    }
    print "\\end{picture}\n";

    if ($n_trees > 10) {
      @trees = @trees[0..9];
      $n_trees = 10;
    }

    for (my $i=1; $i<=$n_trees; $i++) {
      my $fontsize='\normalsize';
      my $n_leaves = ($trees[$i-1] =~ tr/\(/\(/);	# Trick for counting characters of some type

      if ($n_leaves > 30) {
	$fontsize = '\small';
      }
      if ($n_leaves > 45) {
	$fontsize = '\tiny';
      }

      my $num = $tree_num{$trees[$i-1]};
      my $posterior_prob = int(0.5 + $th{$trees[$i-1]} * 100 / $N);
      print "\\subsubsection*{Tree $num, $posterior_prob \\%}
\\begin{newicktree}
$fontsize
\\setunitlength{3em}
\\nodeseparation{1ex}
\\righttree
\\drawtree{
";
      my $tmptree = $trees[$i-1];
      $tmptree =~ s/_/\\_/g;
      print $tmptree, ";}\n\\end{newicktree}\n";
    }
    # It is usually a good idea to break here!
    print "\\pagebreak\n";
  }

}

#
# Orthology
#
sub analyze_orthologypairs {
  my ($name, $burnin, $max_sample, $format, $data) = @_;

  my $start = $burnin + 1;
  my $N = scalar(keys(%{$data->{$name}})) - $burnin;


  my %O = ();
  for (my $j = $start; $j < $start + $N; $j++) {
    my $ortho_str = $data->{$name}{$j};
    my @pairs = split(/(?:(?<!^)\s*(?=\[))/, $ortho_str);
    foreach my $pair (@pairs) {
      $pair =~ m/\[(\S+),(\S+)\]=([-+e0-9\.]+)/;
      $O{"$1, $2"} += $3;
    }
  }

  if ($format eq 'txt') {
    print "$name\tRanked by probability\n";
    foreach my $pair (sort {$O{$b} <=> $O{$a}} keys %O) {
      printf("\t%.6g\t%s\n", $O{$pair} / $N, $pair);
    }
    print "\n";
  } else {			# LaTeX
    my $pname = $name;
    $pname =~ s/_/\\_/g;
    print "\\pagebreak\\section{Orthologous relationships: $pname}\n";
    print "Ranked by probability.



\\setlongtables
\\begin{longtable}{lll}\n";
    foreach my $pair (sort {$O{$b} <=> $O{$a}} keys %O) {
      my @p = split(/,/, $pair);
      $p[0] =~ s/_/\\_/g;
      $p[1] =~ s/_/\\_/g;
      printf("%.6g & %s & %s\\\\ \n",   $O{$pair}/ $N, $p[0], $p[1]);
    }
    print "\\end{longtable}\n\n";
  }
}

#
# reconciliations
#
sub analyze_reconciliation {
  my ($name, $burnin, $max_sample, $format, $data) = @_;

  my %th = ();
  my $start = $burnin + 1;
  my $N = scalar(keys(%{$data->{$name}})) - $burnin;

  for (my $j = $start; $j < $start + $N; $j++) {
    $th{$data->{$name}{$j}}++;
  }

  if ($format eq 'txt') {
    print "$name\tRanked by probability\n";
    foreach my $t (sort {$th{$b} <=> $th{$a}} keys %th) {
      printf("\t%.6g\t%s\n", $th{$t} / $N, $t);
    }
    print "\n";
  }
}



#
#  Prologue
#
sub print_prologue {
  my ($name, $flist, $cmd, $n, $burnin) = @_;

  my $fname = join(', ', @$flist);
  if ($format eq 'txt') {
    if (defined $name) {
      print "Job name: $name\n";
    }
    print "Command: $cmd\n";
    print "$n datapoints from $fname.\n";
    print "Burnin is $burnin samples.\n";
  } elsif ($format eq 'latex') {
    print "\\documentclass[a4paper]{article}
\\usepackage{newicktree}
\\usepackage{longtable}

\\begin{document}
";
    if (defined $name) {
      print "\\title{$name}
\\date{}
\\author{}
\\maketitle
";
}

    print "\\section{Summary}

Number of samples: $n\\\\
Files: \\verb+$fname+\\\\
Command:
\\begin{verbatim}
$cmd
\\end{verbatim}

\\noindent Burnin is $burnin samples.

";
  } else {
    die "oops, a programming problem!";
  }
}

#
#  Ending
#
sub print_end {
  if ($format eq 'latex') {
    print "\\end{document}\n";
  }
}

#
#
#
sub print_tex_number_stats {
  my ($name, $mean, $sdev, $harmonic_mean, $ma, $mi, $max_param, $bc90, $bc95, $bc99) = @_;


  my $tmpname = $name;
  $tmpname =~ s/_/\\_/g;
  my $format = "\\section{Parameter: %s}\nMean = %.6g StdDev = %.6g ";
  if (defined $harmonic_mean) {
    $format .= "Harmonic mean = %.6g";
  }
  $format .= "\\\\
Max = %.6g, min = %.6g\\\\
At max posterior probability: %.6g
\\subsubsection*{Bayesian confidence}
\\begin{tabular}{ll}
Level & Interval\\\\ \\hline
 90 \\%% & [%s]\\\\
 95 \\%% & [%s]\\\\
 99 \\%% & [%s] \\\\ \\hline
\\end{tabular}
";
  if (defined $harmonic_mean) {
    printf($format,
	   $tmpname, $mean, $sdev, $harmonic_mean, $ma, $mi, $max_param, $bc90, $bc95, $bc99);
  } else {
    printf($format,
	   $tmpname, $mean, $sdev, $ma, $mi,  $max_param, $bc90, $bc95, $bc99);
  }
}


sub get_actual_burnin {
  my $burnin = shift @_;
  my $sample_no = shift @_;


  if ($burnin < 1) {
    return floor($burnin * $sample_no);
  } else {
    if ($burnin < $sample_no - 2) {
      return $burnin;
    } else {
      print STDERR "Too few data points or too large burnin ($burnin).\n";
      exit 2;
    }
  }
}

# This perform the addition e^p + e^q
sub addlog {
  my $p = shift @_;
  my $q = shift @_;

  if ($p > $q) {
    $p = $p + log1p(exp($q - $p));
  } else {
    $p = $q + log1p(exp($p - $q));
  }
  return $p;
}
prime-phylo 1.0.11-4+b3 / usr / bin / mcmc_analysis