/usr/bin/med2ris

#!/usr/bin/perl

## med2ris: converts Pubmed/Medline datasets to RIS format
##             auto-detects tagged and XML data formats
##
## usage: perl med2ris < infile.med > outfile.ris
##
## Dependencies: perl 5.0.0 or later
##               XML::Parser
##               RefDB::Pubmed
##               RefDB::CGI
##               RefDB::Pref
##               RefDB::Log
##               Text::Iconv
##
## markus@mhoenicka.de 2002-12-02

##   This program is free software; you can redistribute it and/or modify
##   it under the terms of the GNU General Public License as published by
##   the Free Software Foundation; either version 2 of the License, or
##   (at your option) any later version.
##   
##   This program is distributed in the hope that it will be useful,
##   but WITHOUT ANY WARRANTY; without even the implied warranty of
##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##   GNU General Public License for more details.
   
##   You should have received a copy of the GNU General Public License
##   along with this program; if not, see <http://www.gnu.org/licenses/>


## define a lightweight class to hold the element info for XML data
package Elinfo;

sub new {
    my $class = shift;
    my $self = {};

    ## the name of the element
    $self->{ELNAME} = undef;

    ## the character data contained in the element, if any
    $self->{ELVALUE} = undef;

    ## a list of attribute key/value pairs, if any
    $self->{ATTLIST} = [];
    
    bless $self, $class;
    return $self;
}

## here the code proper starts
package main;

## the interface module for expat, required to read Pubmed XML data
use XML::Parser;

## the common RefDB modules
use RefDB::CGI;
use RefDB::Log;
use RefDB::Prefs;

## the RefDB module for tagged Pubmed data
use RefDB::Pubmed;

## use this module to read command line options
use Getopt::Std;

## this one is for syslog (who'd have guessed)
use Sys::Syslog;

## character encoding conversion
use Text::Iconv;

## this is for the config file stuff
my $prefs;

## whether or not we run as cgi
my $is_cgi = 0;

## the html fragments for the cgi output will go here
my $refdblib = "/usr/share/refdb";
my $cgi_head;
my $cgi_foot;


## read config file settings
my $confdir = "/etc/refdb";
my $read_prefs = 1;
my $next;

## look for -q and -y options
foreach $arg (@ARGV) {
#    my $next;
    if ($next) {
	$confdir = $arg;
	$next = 0;
    }
    elsif ($arg eq "-y") {
	$next = 1;
    }
    elsif ($arg eq "-q") {
	$read_prefs = 0;
    }
}

## read config files
if ($read_prefs) {
    if (RefDB::CGI::check_cgi("GET") == 0) {
	$prefs = RefDB::Prefs::->new("$confdir/med2riscgirc", undef);
	$is_cgi = 1;
    }
    else {
	my $home = $ENV{'HOME'};
	$prefs = RefDB::Prefs::->new("$confdir/med2risrc", "$home/med2risrc");
    }
}

#### variables to hold config options. Will be initialized by whatever
#### was in the config files

## name of the output file, if any. If undef, send data to stdout
my $outfile = $prefs->{"outfile"};

## if f, overwrite; if t, append
my $append = (defined($prefs->{"outappend"})) ? $prefs->{"outappend"} : "f";

## type is either xml or tag
my $type = $prefs->{"type"}; ## leave undef if no default type specified

## whether or not to print info about unknown/unused tags
my $unmapped = (defined($prefs->{"unmapped"})) ? $prefs->{"unmapped"} : "f";

## location of shared data ($refdblib was initialized previously)
my $refdblib = (defined($prefs->{"refdblib"})) ? $prefs->{"refdblib"} : $refdblib;

## character encodings
my $from_enc = (defined($prefs->{"from_enc"})) ? $prefs->{"from_enc"} : "ASCII";
my $to_enc = (defined($prefs->{"to_enc"})) ? $prefs->{"to_enc"} : "UTF-8";

## logging options
my $logfile = (defined($prefs->{"logfile"})) ? $prefs->{"logfile"} : "/var/log/med2ris.log";
my $loglevel = (defined($prefs->{"loglevel"})) ? $prefs->{"loglevel"} : 6;
my $logdest = (defined($prefs->{"logdest"})) ? $prefs->{"logdest"} : 2; ## 0 = stderr, 1 = syslog, 2 = file

## this hash will receive the command line options
my %opts;

## the switches are:
## -e dest: log destination
## -f enc: input data character encoding
## -h: prints help
## -i: print additional info about unused/unknown tags
## -l level: log level
## -L file: log file
## -o/-O file: specifies output file for writing/appending
## -q: ignore config file
## -t enc: output data character encoding
## -T type: override automatic type detection
## -y path: set confdir
getopts('e:f:hil:L:o:O:qt:T:y:', \%opts);

## loop over all command line options
while (($key, $value) = each %opts) {
    if ($key eq "e") {
	$logdest = $value;
    }
    elsif ($key eq "f") {
	$from_enc = $value;
    }
    elsif ($key eq "h") {
	print "med2ris converts tagged or XML Pubmed datasets into RIS datasets\n";
	print "Usage: [perl] med2ris [-e dest] [-f enc] [-h] [-i] [-l level] [-L logfile] [(-o|-O) outfile] [-q] [-t enc] [-T type] [-y path] {filename...}\n       [perl] med2ris [-h] [(-o|-O) outfile]\nThe first invocation reads Pubmed datasets from the specified input files. The second invocation accepts Pubmed datasets at stdin. Output is sent to stdout unless one of the -o/-O options is used\nOptions: -e dest     log destination (stderr|syslog|file)\n         -f enc      set RIS input data character encoding\n         -h          print this help and exit\n         -i          print additional tag info\n         -l loglevel set log level (0-7)\n         -L logfile  path of custom log file\n         -o outfile  send output to outfile (overwrite)\n         -O outfile  send output to outfile (append)\n         -q          ignore config file\n         -t enc      set output data character encoding\n         -T type     override automatic type detection (xml|tag)\n         -y path     set custom config file path\n";
	exit(0);
    }
    elsif ($key eq "i") {
	$unmapped = "t";
    }
    elsif ($key eq "l") {
	$loglevel = $value;
    }
    elsif ($key eq "L") {
	$logfile = $value;
    }
    elsif ($key eq "o") {
	$outfile = $value;
    }
    elsif ($key eq "O") {
	$outfile = $value;
	$append = t;
    }
    elsif ($key eq "q") {
	## do nothing, -q was used before getopts
    }
    elsif ($key eq "t") {
	$to_enc = $value;
    }
    elsif ($key eq "T") {
	$type = ($value eq "tag") ? "tag" : "xml";
    }
    elsif ($key eq "y") {
	## do nothing, -y was used before getopts
    }
}

## post-process a few variables
$logdest = RefDB::Log::num_logdest($logdest);
$loglevel = RefDB::Log::num_loglevel($loglevel);

if (defined($type)) {
    $type = ($type eq "tag") ? $type : "xml";
}
## else: leave undefined

## if we're supposed to write to an output file, try to open it
if (length($outfile) > 0 && $is_cgi == 0) {
    ## try to open the output file
    if ($append eq "t") {
	open OUT, ">>$outfile" or die "cannot open output file for appending: $outfile\n";
    }
    else {
	open OUT, ">$outfile" or die "cannot open output file for overwriting: $outfile\n";
    }

    ## make all print commands send output to this handle
    select OUT;
}

## set up logging
my $log = RefDB::Log::->new($logdest, $loglevel, $logfile, "med2ris");

## we may have to deal with both tagged and XML data unless the type was
## specified on the command line. Load the parsers appropriately
my $parser;
my $pm;

my $converter;

if (!defined($type) || $type ne "tag") {
    ## initialize XML parser
    $parser = new XML::Parser();

    $parser->setHandlers(Start => \&Start_handler,
			 End => \&End_handler,
			 Char => \&Char_handler,
			 Default => \&Default_handler );

    # expat always returns UTF-8 data
    $converter =  Text::Iconv->new("UTF-8", $to_enc);
}

if (!defined($type) || $type ne "xml") {
    ## initialize tagged Pubmed parser
    $pm = new RefDB::Pubmed;
    $pm->set_encodings($from_enc, $to_enc);
    $pm->set_print_unmapped($unmapped);
}

## @ARGV has the remaining arguments if the user specified input
## files. Use stdin if there's no input file argument
if (@ARGV == 0) {
    push @ARGV, "-";
}

## we'll push/pop XML elements on/off this list as they arrive
my @elstack;

## these hashes will receive the current values of pubdate, author,
## chemical, meshheading XML elements
my %currpubdate;
my %currauthor;
my %currchemical;
my %currmesh;

## this string will receive all RIS field except the TY field so the
## output can be dumped in the correct order
my $alloutstring;

## this string will receive the TY field
my $tystring;

## this string collects the abstract chunks. Around 2011, Pubmed
## started to use several <AbstractText> elements for Background,
## Methods and so on
my $n2string;

## this hash helps to convert month names to numbers
my %monthnames = (
		  "Jan" => "01",
		  "Feb" => "02",
		  "Mar" => "03",
		  "Apr" => "04",
		  "May" => "05",
		  "Jun" => "06",
		  "Jul" => "07",
		  "Aug" => "08",
		  "Sep" => "09",
		  "Oct" => "10",
		  "Nov" => "11",
		  "Dec" => "12");

## CGI and regular usage require different data handling
if ($is_cgi) {
    ## load html templates
    $cgi_head = RefDB::CGI::load_html("$refdblib/templates/refdbadd_head.html");
    $cgi_foot = RefDB::CGI::load_html("$refdblib/templates/refdbadd_foot.html");

    ## read CGI data into a string and split into name/value pairs
    read (STDIN, my $input_buffer, $ENV{'CONTENT_LENGTH'});
    my @namevals = split /&/, $input_buffer;

    ## print content type and head template
    print "Content-type: text/html\n\n";
    print $cgi_head;

    ## loop over all name/value pairs
    foreach $item (@namevals) {
	## split name and value
	my ($name, $value) = split /=/, $item;

	## some black magic to decode the POST data string
	$value =~ tr/+/ /;
	$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;

	## addref is the only data field of interest
	if ($name eq "addref") {
	    if ($pm->string($value)) {
		## apparently tagged data
		$log->log_print("debug", "converting tagged data (cgi)");
		while ((my $set = $pm->next_pubmed_set())) {
		    $set->parse_pmset();
		    
		    $set->convert_pmset();
		    $set->dump_pmset_as_ris();
		}
	    }
	    else {
		## apparently XML data
		$log->log_print("debug", "converting XML data (cgi)");
		$parser->parse($value);
	    }
	}
    }

    ## print footer template
    print $cgi_foot;
}
else { ## not CGI
    ## loop over all filename arguments
    foreach $arg (@ARGV) {
	## try to determine data type unless type was specified
	if (!defined($type)) {
	    if (defined($pm->in($arg))) {
		## apparently tagged data
		$log->log_print("debug", "converting tagged data in $arg");
		while ((my $set = $pm->next_pubmed_set())) {
		    $set->parse_pmset();
		    
		    $set->convert_pmset();
		    $set->dump_pmset_as_ris();
		}
	    }
	    else {
		## apparently XML data
		$log->log_print("debug", "converting XML data in $arg");
		$parser->parsefile($arg);
	    }
	}
	elsif ($type eq "xml") {
	    ## the parser calls our handlers whenever it encounters a start tag,
	    ## character data, or an end tag. All data processing is done in
	    ## these handlers
	    $log->log_print("debug", "converting XML data in $arg");
	    $parser->parsefile($arg);
	}
	else { ## tagged data
	    $log->log_print("debug", "converting tagged data in $arg");
	    $pm->in($arg);
	    while ((my $set = $pm->next_pubmed_set())) {
		$set->parse_pmset();
		
		$set->convert_pmset();
		$set->dump_pmset_as_ris();
	    }
	}
    } ## end foreach
}

## done processing all input
$log->close();


######################################################################
## XML parser handlers                                              ##
######################################################################

##********************************************************************
## Start_handler(): this handler is called once for each start tag and
##                  once for each empty element
## Arguments: pointer to "global" parser data (unused)
##            element name
##            list of attribute name/value pairs
## Return value is ignored
##********************************************************************
sub Start_handler {
    my($p, $el, @keyvals) = @_;
    my $numvals = @keyvals;

    ## create a new Elinfo object for the current element
    my $elinf = new Elinfo;

    ## fill in element name and attributes, if any
    $elinf->{ELNAME} = $el;
    @{$elinf->{KEYVALS}} = @keyvals;

    # push the new object on our stack
    push @elstack, $elinf;

} ## end of Start_handler

##********************************************************************
## End_handler(): this handler is called once for each end tag and 
##                once for each empty element
## Arguments: pointer to "global" parser data (unused)
##            element name (unused)
## Return value is ignored
##********************************************************************
sub End_handler {
    my($p, $el) = @_;

    ## pop the current element off the stack
    my $currel = pop @elstack;
    my $elname = $currel->{ELNAME};
    my $elvalue = $currel->{ELVALUE};
    my $IdType = undef; # attribute of the ArticleId element
    my $N2Label = undef; # label of an extende abstract

    ## retrieve the attributes that make a difference
    while (my $key = shift(@{$currel->{KEYVALS}})) {
	my $val = shift(@{$currel->{KEYVALS}});
	if ($key eq "IdType") {
	    $IdType = $val;
	}
	elsif ($key eq "Label") {
	    $n2label = $val . " ";
	}
    }

    ## check available element data and append an appropriate string
    ## to our pool strings
    if ($elname eq "ISSN" && length($elvalue) > 0) {
	$alloutstring .= "SN  - $elvalue\n";
    }
    elsif ($elname eq "ArticleId"
	   && length($elvalue) > 0
	   && $IdType eq "doi") {
	$alloutstring .= "M3  - $elvalue\n";
    } 
    elsif ($elname eq "Volume" && length($elvalue) > 0) {
	$alloutstring .= "VL  - $elvalue\n";
    }
    elsif ($elname eq "Issue" && length($elvalue) > 0) {
	$alloutstring .= "IS  - $elvalue\n";
    }
    elsif ($elname eq "PubDate") {
	my $year = $currpubdate{"Year"};

	## the month is supplied as a three letter abbreviated name
	my $month = $monthnames{$currpubdate{"Month"}};

	## expand day to two digits
	my $day = $currpubdate{"Day"};

	if (length($currpubdate{"Day"}) == 1) {
	    $alloutstring .= "PY  - $year/$month/0$day/\n";
	}
	elsif (length($currpubdate{"Day"}) == 2) {
	    $alloutstring .= "PY  - $year/$month/$day/\n";
	}
	else {
	    $alloutstring .= "PY  - $year/$month//\n";
	}

	## reset pool
	%currpubdate = ();
    }
    elsif ($elname eq "DateCreated"
	   || $elname eq "DateCompleted"
	   || $elname eq "DateRevised"
	   || $elname eq "PubMedPubDate") {

	## these dates are currently not evaluated. However, as they may contain year, month, and day elements, we have to reset the pool
	%currpubdate = ();
    }
    elsif ($elname eq "ArticleTitle" && length($elvalue) > 0) {
	$alloutstring .= "TI  - $elvalue\n";
    }
    elsif ($elname eq "MedlinePgn" && length($elvalue) > 0) {
	my $startpage = $elvalue;
	my $endpage = $elvalue;

	## try to split the string into start and end page
	## everything before a dash or the whole string if no dash
	## is present goes into $startpage
	## everything after a dash or the whole string if no dash
	## is present goes into $endpage
	$startpage =~ s/-.*//;
	$endpage =~ s/.*-//;

	if (length($startpage) > 0) {
	    $alloutstring .= "SP  - $startpage\n";
	}

	## print endpage only if it differs from startpage
	if (length($endpage) > 0 && $startpage ne $endpage) {
	    $alloutstring .= "EP  - $endpage\n";
	}
    }
    elsif ($elname eq "AbstractText" && length($elvalue) > 0) {
	if (length($n2string) > 0) {
	    $n2string .= "$n2label" . "$elvalue\n";
	}
	else {
	    $n2string .= "N2  - $n2label" . "$elvalue\n";
	}
    }
    elsif ($elname eq "Affiliation" && length($elvalue) > 0) {
	$alloutstring .= "AD  - $elvalue\n";
    }
    elsif ($elname eq "PersonalNameSubject") {
	## reset the pool. We don't transform this element but currauthor
	## contains the name parts anyway
	%currauthor = ();
    }
    elsif ($elname eq "Author") {
	my $firstname = $currauthor{"FirstName"};
	my $forename = $currauthor{"ForeName"};
	my $middlename = $currauthor{"MiddleName"};
	my $lastname = $currauthor{"LastName"};
	my $initials = $currauthor{"Initials"};
	my $suffix = $currauthor{"Suffix"};
	my $collectivename = $currauthor{"CollectiveName"};
	my $rawname;

	if (defined ($collectivename)) {
	    $rawname = $collectivename;
	}
	elsif (defined ($forename)) {
	    ## append a period if the last char is an initial
	    $forename =~ s%([A-Z])$%$1.%g;

	    ## replace space following initial with period
	    $forename =~ s%([A-Z]) +%$1.%g;

	    $rawname = $lastname . "," . $forename . "," . $suffix;
	}
	elsif (length($middlename) > 0) {
	    $rawname = $lastname . "," . $firstname . " " . $middlename . "," . $suffix;
	}
	else {
	    $rawname = $lastname . "," . $firstname . "," . $suffix;
	}

	## remove spaces after separators
	$rawname =~ s%([,.]+) *%$1%g;

	## remove trailing separators after spaces
	$rawname =~ s% *[,;:/]*$%%;

	## remove periods after a non-abbreviated name
	$rawname =~ s%(\w{2,})\.%$1%g;

	## output the result
	$alloutstring .= "AU  - $rawname\n";

	## reset the pool
	%currauthor = ();
    }
    elsif ($elname eq "PublicationType") {
	## ToDo: this should be improved. We currently map everything
	## to a journal article because RIS doesn't really give us
	## much of a choice. The real type is encoded as a keyword
	$tystring = "\nTY  - JOUR\n";
	$alloutstring .= "KW  - $elvalue\n";
    }
    elsif ($elname eq "MedlineTA" && length($elvalue) > 0) {
	$alloutstring .= "JO  - $elvalue\n";
    }
    elsif ($elname eq "Chemical") {
	my $regnum = $currchemical{"RegistryNumber"};
	my $chemname = $currchemical{"NameOfSubstance"};

	if (length($regnum) > 0) {
	    if (length($chemname) > 0) {
		$alloutstring .= "KW  - $regnum ($chemname)\n";
	    }
	    else {
		$alloutstring .= "KW  - $regnum\n";
	    }
	}
	elsif (length($chemname) > 0) {
	    $alloutstring .= "KW  - $chemname\n";
	}

	## reset the pool
	%currchemical = ();
    }
    elsif ($elname eq "MeshHeading") {
	my $descr = $currmesh{"DescriptorName"};
	my $qual = $currmesh{"QualifierName"};

	## the parser does funny things with entities like &#38 so
	## we have to fix the $qual string lest we get things like
	## tyrosine [analogs][&][ derivatives]
	$qual =~ s/\]\[&\]\[/ &/g;

	## we can safely assume that we'll never see a qualifier without
	## descriptor
	if (length($qual) > 0) {
	    ## now we split the qualifier string into single qualifiers
	    my @tokens = split m%\[%, $qual;
	
	    foreach $item (@tokens) {
		## the first item will be whitespace, ignore
		$item =~ s/^\s//;
		if (length($item)> 0) {
		    $alloutstring .= "KW  - $descr [$item\n";
		}
	    }
	}
	elsif (length($descr) > 0) {
	    $alloutstring .= "KW  - $descr\n";
	}

	## reset the pool
	%currmesh = ();
    }
    elsif ($elname eq "PMID") {
	## each Medline article has a PMID, but this element is also
	## used to crosslink to corrections, comments and such. We
	## only want the article's own PMID, so we have to peek at the
	## top of the stack and make sure the current PMID element is
	## a child of a MedlineCitation element
	my $currel = $elstack[-1];
	my $currel_elname = $currel->{ELNAME};
	if ($currel_elname eq "MedlineCitation") {
	    $alloutstring .= "L2  - http://www.ncbi.nlm.nih.gov/pubmed/$elvalue\n";
	}
    }
    elsif ($elname eq "PubmedArticle") {
	## the article is done so dump all available information
	print $tystring,$alloutstring,$n2string,"ER  - \n";

	## reset our pool strings
	$alloutstring = "";
	$tystring = "";
	$n2string = "";
	$n2label = "";
    }

} ## end of End_handler

##********************************************************************
## Char_handler(): this handler is called at least once for character
##                 data within an element
## Arguments: pointer to "global" parser data (unused)
##            character data
## Return value is ignored
##********************************************************************
sub Char_handler {
    my($p, $string) = @_;

    ## remove trailing whitespace
    $string =~ s/\s*$//g;

    ## character encoding changes here
    $string = $converter->convert($string);

    ## set the value of the current top element of the stack. As the parser
    ## may legally provide the character data of one element in more than
    ## one call to this handler we have to append the current $string to
    ## whatever is already there
    my $currel = $elstack[-1];
    $currel->{ELVALUE} = $currel->{ELVALUE} . $string;
    my $elname = $currel->{ELNAME};

    ## some RIS fields are composed of several elements. We pool the
    ## data here and dump them as soon as the enclosing element ends
    if ($elname eq "Year" || $elname eq "Month" || $elname eq "Day") {
	$currpubdate{$elname} = $currel->{ELVALUE};
    }
    elsif ($elname eq "MedlineDate") {
	## apparently common in older citations, contains plain text date
	$currpubdate{"Year"} = $currel->{ELVALUE};
	## make no attempt to extract anything but a 4-digit year
	$currpubdate{"Year"} =~ s/.*(\d{4,}).*/$1/;
	$currpubdate{"Month"} = "";
	$currpubdate{"Day"} = "";
    }
    elsif ($elname eq "FirstName"
	   || $elname eq "ForeName"
	   || $elname eq "MiddleName"
	   || $elname eq "LastName"
	   || $elname eq "Suffix"
	   || $elname eq "CollectiveName"
	   || $elname eq "Initials") {
	$currauthor{$elname} .= $currel->{ELVALUE};
    }
    elsif ($elname eq "RegistryNumber"
	    || $elname eq "NameOfSubstance") {
	$currchemical{$elname} .= $currel->{ELVALUE};
    }
    elsif ($elname eq "DescriptorName") {
	$currmesh{$elname} .= $currel->{ELVALUE};
    }
    elsif ($elname eq "QualifierName") {
	## using a list for the qualifiers would be more appropriate
	## but the parser returns character entities as a separate
	## char chunk. We have to concatenate the stuff to a string
	## and fix the charent oddities later
	$currmesh{$elname} = $currmesh{$elname} . "[" . $string . "]";
    }

} ## end of Char_handler

##********************************************************************
## Default_handler(): this handler is called for everything else
## Arguments: pointer to "global" parser data (unused)
##            string
## Return value is ignored
##********************************************************************
sub Default_handler {
    my($p, $string) = @_;

    ## these data are currently not used

} ## end of Default_handler
refdb-clients 1.0.2-1 / usr / bin / med2ris