# /usr/share/bibus/Import/PubMedXML.py

# Copyright 2004,2005 Pierre Martineau <pmartino@users.sourceforge.net>
# This file is part of Bibus, a bibliographic database that can
# work together with OpenOffice.org to generate bibliographic indexes.
#
# Bibus is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Bibus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bibus; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA.
#
# OPENOFFICE_FIELDS=('Identifier', 'Bibliographic_Type', 'Address', 'Annote', 'Author', 'Booktitle', 'Chapter', 'Edition', 'Editor','HowPublished', 'Institution', 'Journal', 'Month', 'Note', 'Number', 'Organizations', 'Pages', 'Publisher', 'School', 'Series', 'Title', 'Report_Type', 'Volume', 'Year', 'URL', 'Custom1', 'Custom2', 'Custom3', 'Custom4', 'Custom5', 'ISBN')
# Bibliographic_Type = BIB.BIBLIOGRAPHIC_TYPE[Type[<PublicationType>]]
# see http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helppubmed.table.pubmedhelp.T41
#
from __future__ import generators	# to be removed in python 2.3
import BIB
import xml.dom.minidom

# Not referenced by the code in this module.
# NOTE(review): presumably read by the program that loads this import filter to
# choose the text encoding of the input file, None leaving the detection to the
# XML parser -- confirm against the caller.
DEFAULT_ENCODING = None

class importRef(object):
	"""Iterable importer for PubMed XML files.

	Iterating over an instance parses self.infile and yields the records one by one,
	each as a list of fields (see __convertRecord for the order). The first item of
	a record, the database id, is always None: the id is added by the main program,
	usually it will be 'NULL' to get an automatic field in MySQL, but it may be
	different for another database backend.

	Elements that are absent or empty in a record give an empty field instead of
	raising, so that a single odd record does not abort the whole import."""

	# PubMed publication types and the BIB.BIBLIOGRAPHIC_TYPE key they map to.
	# Order matters: a paper can have several types and the first entry of this
	# table that is found in its type list wins.
	__TYPE_MAP = (
		('Technical Report', 'TECHREPORT'),
		('Congresses', 'PROCEEDINGS'),
		('Consensus Development Conference', 'PROCEEDINGS'),
		('Consensus Development Conference, NIH', 'PROCEEDINGS'),
		('Monograph', 'BOOKLET'),
		('Textbooks', 'BOOK'),
		('Laboratory Manuals', 'MANUAL'),
		('Meeting Abstracts', 'INPROCEEDINGS'),
		('Posters', 'INPROCEEDINGS'),
		('Clinical Conference', 'INPROCEEDINGS'),
	)

	def __init__(self,infile):
		# Handed unchanged to xml.dom.minidom.parse(), which accepts either
		# a file name or an open file object.
		self.infile = infile

	def __iter__(self):
		"""Generator of records. for record in <instance>: ...
		The whole document is parsed first, then one converted record is
		yielded for each <PubmedArticle> element."""
		mydom = xml.dom.minidom.parse(self.infile)
		for article in mydom.getElementsByTagName("PubmedArticle"):
			yield self.__convertRecord(article)

	def __toBibusType(self,ptype):
		"""ptype is a list of PubMed types since PubMed papers can have multiple types.
		Return the best matching OOo type (a value of BIB.BIBLIOGRAPHIC_TYPE)
		or ARTICLE=0 by default."""
		for pubmedType,bibusKey in self.__TYPE_MAP:
			if pubmedType in ptype:
				return BIB.BIBLIOGRAPHIC_TYPE[bibusKey]
		return 0	# ARTICLE

	def __nodeText(self,node):
		"""Return all the character data found below node, in document order.
		Unlike node.firstChild.data this works for an empty element (returns "")
		and does not stop at inline markup such as <i>...</i> or <sup>...</sup>."""
		chunks = []
		for child in node.childNodes:
			if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
				chunks.append(child.data)
			elif child.nodeType == child.ELEMENT_NODE:
				chunks.append(self.__nodeText(child))
		return "".join(chunks)

	def __firstText(self,parent,tag):
		"""Return the text of the first <tag> element found below parent,
		or "" if there is no such element or if it is empty."""
		found = parent.getElementsByTagName(tag)
		if not found:
			return ""
		return self.__nodeText(found[0])

	def __pubDate(self,journal):
		"""Return (Year, Month) read from the first <PubDate> below journal.
		If there is no <Year> the date is taken from the free-form <MedlineDate>:
		its first word is the year and its second word, if any, the month."""
		dates = journal.getElementsByTagName("PubDate")
		if not dates:
			return "",""
		pubdate = dates[0]
		if pubdate.getElementsByTagName("Year"):
			return self.__firstText(pubdate,"Year"), self.__firstText(pubdate,"Month")
		Year,Month = "",""
		words = self.__firstText(pubdate,"MedlineDate").split()
		if words:
			Year = words[0]
		if len(words) > 1:
			Month = words[1]
		return Year,Month

	def __abstract(self,record):
		"""Return the abstract of the record as a single string.
		All the <AbstractText> sections of the first <Abstract> are used, one per line.
		When there are several sections each one is prefixed with its Label attribute,
		if it has one. A single section is returned as it is.
		Without any <Abstract>, the first <AbstractText> of the record is used."""
		abstracts = record.getElementsByTagName("Abstract")
		if abstracts:
			sections = abstracts[0].getElementsByTagName("AbstractText")
		else:
			sections = record.getElementsByTagName("AbstractText")[:1]
		labelled = len(sections) > 1
		parts = []
		for section in sections:
			text = self.__nodeText(section)
			if not text:
				continue
			label = section.getAttribute("Label")
			if labelled and label:
				text = "%s: %s" %(label,text)
			parts.append(text)
		return "\n".join(parts)

	def __authors(self,record):
		"""Return the authors of the first <AuthorList> joined with BIB.SEP,
		or "" if the record has no author list (as in PMID = 17344863).
		Each author is formatted as "LastName, given names", the initials being
		used when there is no given name. An author without any personal name
		is represented by its <CollectiveName>, if present."""
		lists = record.getElementsByTagName("AuthorList")
		if not lists:
			return ""
		authors = lists[0]
		authorsList = []
		for author in authors.getElementsByTagName("Author"):
			if author.getAttribute("ValidYN") == "N":	# wrong Author spelling => we drop it
				continue								# since the correct one is also present
			LastName = self.__firstText(author,"LastName")
			given = [ self.__firstText(author,tag) for tag in ("FirstName","MiddleName","ForeName","Suffix") ]
			pn = " ".join(" ".join(given).split())	# drop the empty parts and the extra spaces
			if not pn:
				pn = self.__firstText(author,"Initials")
			if LastName or pn:
				authorsList.append( ", ".join( (LastName,pn) ) )
			else:
				# no personal name at all: group authorship
				collective = self.__firstText(author,"CollectiveName")
				if collective:
					authorsList.append( collective )
		# We test if the list of authors is complete and we eventually add "et al."
		if authors.getAttribute("CompleteYN") == "N":
			authorsList.append( "et al." )
		return BIB.SEP.join(authorsList)

	def __convertRecord(self,record):
		"""Convert a <PubmedArticle> element to a Bibus record.
		Return the list [None] + the record fields in the order
		('Identifier', 'Bibliographic_Type', 'Address', 'Annote', 'Author', 'Booktitle', 'Chapter', 'Edition', 'Editor','HowPublished', 'Institution', 'Journal', 'Month', 'Note', 'Number', 'Organizations', 'Pages', 'Publisher', 'School', 'Series', 'Title', 'Report_Type', 'Volume', 'Year', 'URL', 'Custom1', 'Custom2', 'Custom3', 'Custom4', 'Custom5', 'ISBN','Abstract')
		Bibliographic_Type is a number, all the other fields are strings."""
		# Fields for which nothing is read from PubMed stay empty
		Identifier = Annote = Booktitle = Chapter = Edition = Editor = ""
		HowPublished = Institution = Note = Organizations = Publisher = ""
		School = Series = Report_Type = ""
		Custom2 = Custom3 = Custom4 = Custom5 = ""
		# Defaults for the fields that depend on optional elements
		Volume = Number = Year = Month = URL = ""
		# getting the type => list of the "PublicationType" texts
		pubtype = [ self.__nodeText(pt) for pt in record.getElementsByTagName("PublicationType") ]
		Bibliographic_Type = self.__toBibusType(pubtype)
		# journal: volume, issue and date are searched inside <Journal> only
		journals = record.getElementsByTagName("Journal")
		if journals:
			Volume = self.__firstText(journals[0],"Volume")
			Number = self.__firstText(journals[0],"Issue")
			Year,Month = self.__pubDate(journals[0])
		Journal = self.__firstText(record,"MedlineTA")
		ISBN = self.__firstText(record,"ISSN")	# the ISSN goes in the ISBN field; may be absent on very old records
		# Article data
		pmid = self.__firstText(record,"PMID")
		if pmid:
			URL = "%s/%s" %(BIB.PUBMEDVIEW,pmid)
		Title = self.__firstText(record,"ArticleTitle")
		if Title.endswith('.'):
			Title = Title[:-1].rstrip()   	# remove final dot if present
		Pages = self.__firstText(record,"MedlinePgn")
		Abstract = self.__abstract(record)
		Address = self.__firstText(record,"Affiliation")
		# Authors
		Author = self.__authors(record)
		# We put keywords in Custom1, empty <Keyword> elements are ignored
		keywords = [ self.__nodeText(k) for k in record.getElementsByTagName("Keyword") ]
		Custom1 = BIB.SEP.join( [ k for k in keywords if k ] )
		#
		return [None,Identifier, Bibliographic_Type,Address, Annote, Author, Booktitle, Chapter, Edition, Editor,HowPublished, Institution, Journal, Month, Note, Number,Organizations, Pages,Publisher,School, Series, Title, Report_Type, Volume,Year,URL,Custom1,Custom2,Custom3,Custom4,Custom5,ISBN,Abstract]
# bibus 1.5.2-4 / usr / share / bibus / Import / PubMedXML.py