/usr/bin/csv2mipe

#!/usr/bin/perl

#    This library is free software; you can redistribute it and/or
#    modify it under the terms of the GNU Lesser General Public
#    License as published by the Free Software Foundation; either
#    version 2.1 of the License, or (at your option) any later version.
#
#    This library is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#    Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public
#    License along with this library ('COPYING'); if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

use strict;
use warnings;
use XML::Twig;

=head1 NAME

csv2mipe.pl - Generates MIPE file based on 3 tab-delimited files
  based on MIPE version v1.1
  arguments: * tab-delimited file with PCR-level data
             * tab-delimited file with SNP-level data
             * tab-delimited file with assay-level data

  Columns in file with PCR-level data:
    pcr_id
    pcr_modified (might be multiple, divided by semi-colon ";")
    pcr_project (might be multiple, divided by semi-colon ";")
    pcr_researcher (might be multiple, divided by semi-colon ";")
    pcr_species
    source_type
    source_id
    design_seq
    primer1_oligo
    primer1_seq
    primer1_tm
    primer2_oligo
    primer2_seq
    primer2_tm
    design_remark (might be multiple, divided by semi-colon ";")
    use_seq
    use_revcomp
    use_remark (might be multiple, divided by semi-colon ";")
    pcr_remark (might be multiple, divided by semi-colon ";")

  Columns in file with SNP-level data:
    pcr_id
    snp_id
    snp_pos
    snp_amb
    snp_remark (might be multiple, divided by semi-colon ";")

  Columns in file with assay-level data:
    pcr_id
    snp_id
    assay_type
    assay_id
    assay_enzyme
    assay_oligo
    assay_specific
    assay_tail
    assay_strand
    assay_remark (might be multiple, divided by semi-colon ";")

=head1 SYNOPSIS

csv2mipe.pl <pcr_file.csv> <snp_file.csv> <assay_file.csv>

=head1 ADDITIONAL INFO

See http://mipe.sourceforge.net

=head1 AUTHOR

Jan Aerts (jan.aerts@bbsrc.ac.uk)

=cut

### Command line: three tab-delimited input files, in this order.
### Extra arguments are ignored; fewer than three is a usage error.
if ( @ARGV < 3 ) { die "Usage: csv2mipe.pl <pcr_file.csv> <snp_file.csv> <assay_file.csv>\n" }
my ( $pcr_file, $snp_file, $assay_file ) = @ARGV;

# Reads a tab-delimited file and returns its rows.
#   $file     - path of the file to read
#   $expected - number of columns every row must have
# Returns a list with one array reference (the fields) per non-empty line.
# Dies if the file cannot be opened or if a line has the wrong number of fields.
sub _read_table {
  my ( $file, $expected ) = @_;

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as a mode or a pipe.
  open my $fh, '<', $file or die "Cannot open $file: $!\n";

  my @rows;
  while ( my $line = <$fh> ) {
    chomp $line;
    next if $line eq '';    # tolerate blank lines (e.g. at the end of the file)

    # A negative limit keeps trailing empty fields, so a row whose last
    # column(s) are empty is still recognised as complete.
    my @fields = split /\t/, $line, -1;

    # Surplus trailing tabs were silently accepted before; keep accepting them.
    pop @fields while @fields > $expected and $fields[-1] eq '';

    if ( @fields != $expected ) { die "Wrong number of fields in the following line of $file:\n$line\n" }
    push @rows, \@fields;
  }
  close $fh;

  return @rows;
}

### Read PCR data
# Column names following pcr_id, in file order.
my @pcr_columns = qw(
  pcr_modified pcr_project pcr_researcher pcr_species
  source_type source_id design_seq
  primer1_oligo primer1_seq primer1_tm
  primer2_oligo primer2_seq primer2_tm
  design_remark use_seq use_revcomp use_remark pcr_remark
);

# $pcr_data{pcr_id}{column} = value
my %pcr_data;
foreach my $row ( _read_table( $pcr_file, 19 ) ) {
  my ( $pcr_id, @values ) = @{$row};
  @{ $pcr_data{$pcr_id} }{@pcr_columns} = @values;
}

### Read SNP data
# $snp_data{pcr_id}{snp_id}{column} = value
my %snp_data;
foreach my $row ( _read_table( $snp_file, 5 ) ) {
  my ( $pcr_id, $snp_id, @values ) = @{$row};
  @{ $snp_data{$pcr_id}{$snp_id} }{qw( snp_pos snp_amb snp_remark )} = @values;
}

### Read assay data
# $assay_data{pcr_id}{snp_id}{assay_id}{column} = value
# Note the column order read here: the assay type comes BEFORE the assay id.
my %assay_data;
foreach my $row ( _read_table( $assay_file, 10 ) ) {
  my ( $pcr_id, $snp_id, $assay_type, $assay_id, @values ) = @{$row};
  @{ $assay_data{$pcr_id}{$snp_id}{$assay_id} }{
    qw( assay_type assay_enzyme assay_oligo assay_specific assay_tail assay_strand assay_remark )
  } = ( $assay_type, @values );
}

### Print everything

# Returns $text with the XML special characters & < > " replaced by their
# predefined entities; an undefined value is treated as the empty string.
sub _xml_escape {
  my ($text) = @_;
  $text = '' unless defined $text;
  $text =~ s/&/&amp;/g;    # must run first so the entities below are not re-escaped
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;
  $text =~ s/"/&quot;/g;
  return $text;
}

# Prints one "<tag>value</tag>" line, preceded by $indent, with the value escaped.
sub _print_element {
  my ( $indent, $tag, $value ) = @_;
  print $indent, "<$tag>", _xml_escape($value), "</$tag>\n";
  return;
}

# Prints one element per semicolon-separated item of $list (nothing if it is empty).
sub _print_list {
  my ( $indent, $tag, $list ) = @_;
  return unless defined $list;
  _print_element( $indent, $tag, $_ ) foreach split /;/, $list;
  return;
}

print "<?xml version='1.0'?>\n";

# The attribute quotes must be escaped inside a double-quoted string.
# NOTE(review): the schema location points into a developer's home directory,
# and the POD mentions MIPE v1.1 while <version> says 1.0 - confirm both.
print "<mipe xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
print " xsi:noNamespaceSchemaLocation=\"file:/home/aerts001/Documents/tmp/mipe_new.xsd\">\n";
print "  <version>1.0</version>\n";

foreach my $pcr_id ( sort keys %pcr_data ) {
  my $pcr = $pcr_data{$pcr_id};

  print "  <pcr id=\"", _xml_escape($pcr_id), "\">\n";
  _print_element( '    ', 'id', $pcr_id );
  _print_list( '    ', 'modified',   $pcr->{pcr_modified} );
  _print_list( '    ', 'project',    $pcr->{pcr_project} );
  _print_list( '    ', 'researcher', $pcr->{pcr_researcher} );
  _print_element( '    ', 'species', $pcr->{pcr_species} );

  # Design section: source, design sequence, both primers, remarks.
  print "    <design>\n";
  print "      <source>\n";
  # The source type is used verbatim as the element name, so it is not escaped.
  print "        <", $pcr->{source_type}, ">", _xml_escape( $pcr->{source_id} ), "</", $pcr->{source_type}, ">\n";
  print "      </source>\n";
  _print_element( '      ', 'seq', $pcr->{design_seq} );
  foreach my $primer (qw( primer1 primer2 )) {
    print "      <$primer>\n";
    _print_element( '        ', 'oligo', $pcr->{"${primer}_oligo"} );
    _print_element( '        ', 'seq',   $pcr->{"${primer}_seq"} );
    _print_element( '        ', 'tm',    $pcr->{"${primer}_tm"} );
    print "      </$primer>\n";
  }
  _print_list( '      ', 'remark', $pcr->{design_remark} );
  print "    </design>\n";

  # Use section: sequence, SNPs (each with its assays), remarks.
  print "    <use>\n";
  _print_element( '      ', 'seq',     $pcr->{use_seq} );
  _print_element( '      ', 'revcomp', $pcr->{use_revcomp} );

  my $snps = $snp_data{$pcr_id} || {};    # a PCR may have no SNPs
  foreach my $snp_id ( sort keys %{$snps} ) {
    my $snp = $snps->{$snp_id};

    print "      <snp id=\"", _xml_escape($snp_id), "\">\n";
    _print_element( '        ', 'id',  $snp_id );
    _print_element( '        ', 'pos', $snp->{snp_pos} );
    _print_element( '        ', 'amb', $snp->{snp_amb} );
    _print_list( '        ', 'remark', $snp->{snp_remark} );

    my $assays = $assay_data{$pcr_id}{$snp_id} || {};    # a SNP may have no assays
    foreach my $assay_id ( sort keys %{$assays} ) {
      my $assay = $assays->{$assay_id};
      my $type  = uc( defined $assay->{assay_type} ? $assay->{assay_type} : '' );

      print "        <assay id=\"", _xml_escape($assay_id), "\">\n";
      _print_element( '          ', 'type', $type );
      _print_element( '          ', 'id',   $assay_id );
      if ( $type eq 'SBE' ) {
        # Single-base-extension assays are described by their oligo.
        _print_element( '          ', 'oligo',    $assay->{assay_oligo} );
        _print_element( '          ', 'specific', $assay->{assay_specific} );
        _print_element( '          ', 'tail',     $assay->{assay_tail} );
        _print_element( '          ', 'strand',   $assay->{assay_strand} );
      } else {
        # Every other assay type is described by its enzyme only.
        _print_element( '          ', 'enzyme', $assay->{assay_enzyme} );
      }
      _print_list( '          ', 'remark', $assay->{assay_remark} );

      print "        </assay>\n";
    }

    print "      </snp>\n";
  }

  _print_list( '      ', 'remark', $pcr->{use_remark} );
  print "    </use>\n";
  _print_list( '    ', 'remark', $pcr->{pcr_remark} );

  print "  </pcr>\n";
}

print "</mipe>\n";
mipe 1.1-4 / usr / bin / csv2mipe