This file is indexed.

/usr/share/bibus/Import/PubMedXML.py is in bibus 1.5.2-4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Copyright 2004,2005 Pierre Martineau <pmartino@users.sourceforge.net>
# This file is part of Bibus, a bibliographic database that can
# work together with OpenOffice.org to generate bibliographic indexes.
#
# Bibus is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Bibus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bibus; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA.
#
# OPENOFFICE_FIELDS=('Identifier', 'Bibliographic_Type', 'Address', 'Annote', 'Author', 'Booktitle', 'Chapter', 'Edition', 'Editor','HowPublished', 'Institution', 'Journal', 'Month', 'Note', 'Number', 'Organizations', 'Pages', 'Publisher', 'School', 'Series', 'Title', 'Report_Type', 'Volume', 'Year', 'URL', 'Custom1', 'Custom2', 'Custom3', 'Custom4', 'Custom5', 'ISBN')
# Bibliographic_Type = BIB.BIBLIOGRAPHIC_TYPE[Type[<PublicationType>]]
# see http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helppubmed.table.pubmedhelp.T41
#
from __future__ import generators	# to be removed in python 2.3
import BIB
import xml.dom.minidom

# Module-level default; it is not referenced by any other code visible in this module.
# NOTE(review): presumably read by the import framework to choose an input encoding,
# with None meaning "no forced encoding" (the XML parser handles decoding) -- confirm against the caller.
DEFAULT_ENCODING = None

class importRef(object):
	"""Iterable importer for PubMed XML.

	Iterating over an instance parses the input with xml.dom.minidom and yields
	one record (a list of fields) per <PubmedArticle> element.
	The first item of each record is a None placeholder for the id: the real id is
	set by the main program, usually 'NULL' to get an automatic field in MySQL,
	but it may be different for another database backend."""

	def __init__(self,infile):
		"""infile is handed unchanged to xml.dom.minidom.parse when iterating."""
		self.infile = infile

	def __iter__(self):
		"""Generator of records. for record in <instance>: ... """
		mydom = xml.dom.minidom.parse(self.infile)
		for article in mydom.getElementsByTagName("PubmedArticle"):
			yield self.__convertRecord(article)

	def __nodeText(self,node):
		"""Return the concatenated text of node and all its descendants.
		Returns "" for None or for an element without text, so that empty elements
		(<Volume/>) and inline markup (<i>, <sup>, ...) never raise."""
		if node is None:
			return ""
		parts = []
		for child in node.childNodes:
			if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
				parts.append(child.data)
			elif child.nodeType == child.ELEMENT_NODE:
				parts.append(self.__nodeText(child))
		return "".join(parts)

	def __firstText(self,parent,tag):
		"""Return the text of the first <tag> descendant of parent, or "" if there is none."""
		nodes = parent.getElementsByTagName(tag)
		if not nodes:
			return ""
		return self.__nodeText(nodes[0])

	def __toBibusType(self,ptype):
		"""ptype is a list of PubMed types since PubMed papers can have multiple types.
		Return the BIB.BIBLIOGRAPHIC_TYPE value of the best matching OOo type,
		or ARTICLE=0 by default."""
		# Ordered by priority: the first PubMed type found in ptype wins.
		typeMap = (
			('Technical Report', 'TECHREPORT'),
			('Congresses', 'PROCEEDINGS'),
			('Consensus Development Conference', 'PROCEEDINGS'),
			('Consensus Development Conference, NIH', 'PROCEEDINGS'),
			('Monograph', 'BOOKLET'),
			('Textbooks', 'BOOK'),
			('Laboratory Manuals', 'MANUAL'),
			('Meeting Abstracts', 'INPROCEEDINGS'),
			('Posters', 'INPROCEEDINGS'),
			('Clinical Conference', 'INPROCEEDINGS'),
		)
		for pubmedType, bibusKey in typeMap:
			if pubmedType in ptype:
				return BIB.BIBLIOGRAPHIC_TYPE[bibusKey]
		return 0	# ARTICLE

	def __abstractText(self,record):
		"""Return the abstract of record, or "" if it has none.
		All <AbstractText> sections of the first <Abstract> element are joined with a
		space, so structured abstracts are no longer cut after their first section.
		<OtherAbstract> content is only used (first section) when there is no <Abstract>."""
		abstracts = record.getElementsByTagName("Abstract")
		if abstracts:
			sections = abstracts[0].getElementsByTagName("AbstractText")
		else:
			sections = record.getElementsByTagName("AbstractText")[:1]
		texts = [ self.__nodeText(section) for section in sections ]
		return " ".join( [ text for text in texts if text ] )

	def __authorNames(self,record):
		"""Return the list of author names found in the first <AuthorList> of record.
		Personal names are formatted "LastName, given names" (falling back to the
		initials), collective names are kept as they are, and "et al." is appended
		when the list is flagged as incomplete. Returns [] when there is no
		<AuthorList> (as in PMID = 17344863)."""
		authorLists = record.getElementsByTagName("AuthorList")
		if not authorLists:
			return []
		authors = authorLists[0]
		names = []
		for author in authors.getElementsByTagName("Author"):
			if author.getAttribute("ValidYN") == "N":	# wrong Author spelling => we drop it
				continue								# since the correct one is also present
			LastName = self.__firstText(author,"LastName")
			FirstName = self.__firstText(author,"FirstName")
			MiddleName = self.__firstText(author,"MiddleName")
			ForeName = self.__firstText(author,"ForeName")
			Suffix = self.__firstText(author,"Suffix")
			Initials = self.__firstText(author,"Initials")
			# join the name parts that are present, normalising the white spaces
			given = " ".join(" ".join((FirstName,MiddleName,ForeName,Suffix)).split())
			if not given:
				given = Initials
			if LastName or given:
				names.append( ", ".join( (LastName,given) ) )
			else:
				# Group authors have no personal name parts; without this they
				# would be stored as the meaningless name ", ".
				collective = self.__firstText(author,"CollectiveName")
				if collective:
					names.append( collective )
		# We test if the list of authors is complete and we eventually add "et al."
		if authors.getAttribute("CompleteYN") == "N":
			names.append( "et al." )
		return names

	def __convertRecord(self,record):
		"""Convert a <PubmedArticle> DOM element to a record.
		Return a list made of None (placeholder for the id) followed by the fields
		('Identifier', 'Bibliographic_Type', 'Address', 'Annote', 'Author', 'Booktitle', 'Chapter', 'Edition', 'Editor','HowPublished', 'Institution', 'Journal', 'Month', 'Note', 'Number', 'Organizations', 'Pages', 'Publisher', 'School', 'Series', 'Title', 'Report_Type', 'Volume', 'Year', 'URL', 'Custom1', 'Custom2', 'Custom3', 'Custom4', 'Custom5', 'ISBN','Abstract')
		Raises IndexError if an element this code treats as required (Journal, PubDate, MedlineTA, PMID,
		ArticleTitle, or MedlineDate when there is no Year) is missing."""
		# Fields that are never filled from a PubMed record stay empty
		Identifier = Annote = Booktitle = Chapter = Edition = Editor = ""
		HowPublished = Institution = Note = Organizations = Publisher = ""
		School = Series = Report_Type = ""
		Custom2 = Custom3 = Custom4 = Custom5 = ""
		Year = Month = ""
		# getting the type => list of node "PublicationType"
		pubtype = [ self.__nodeText(pt) for pt in record.getElementsByTagName("PublicationType") ]
		Bibliographic_Type = self.__toBibusType(pubtype)
		# journal
		journal = record.getElementsByTagName("Journal")[0]					# required data Element
		Volume = self.__firstText(journal,"Volume")
		Number = self.__firstText(journal,"Issue")
		pubDate = journal.getElementsByTagName("PubDate")[0]				# required data Element
		if pubDate.getElementsByTagName("Year"):
			Year = self.__firstText(pubDate,"Year")
			Month = self.__firstText(pubDate,"Month")
		else:
			# if no year there is a MedlineDate, such as "1998 Dec-1999 Jan"
			dateParts = self.__nodeText(pubDate.getElementsByTagName("MedlineDate")[0]).split()
			if dateParts:
				Year = dateParts[0]
			if len(dateParts) > 1:
				Month = dateParts[1]
		Journal = self.__nodeText(record.getElementsByTagName("MedlineTA")[0])	# required data Element
		ISBN = self.__firstText(record,"ISSN")								# ISSN may be absent on very old records
		# Article data
		pmid = self.__nodeText(record.getElementsByTagName("PMID")[0])		# required data Element
		URL = "%s/%s" %(BIB.PUBMEDVIEW,pmid)
		Title = self.__nodeText(record.getElementsByTagName("ArticleTitle")[0])	# required data Element
		if Title.endswith('.'):
			Title = Title[:-1].rstrip()   	# remove final dot if present
		Pages = self.__firstText(record,"MedlinePgn")
		Abstract = self.__abstractText(record)
		Address = self.__firstText(record,"Affiliation")
		# Authors
		Author = BIB.SEP.join( self.__authorNames(record) )
		# We put keywords in Custom1 (empty keywords are ignored)
		keywords = [ self.__nodeText(k) for k in record.getElementsByTagName("Keyword") ]
		Custom1 = BIB.SEP.join( [ k for k in keywords if k ] )
		#
		return [None,Identifier, Bibliographic_Type,Address, Annote, Author, Booktitle, Chapter, Edition, Editor,HowPublished, Institution, Journal, Month, Note, Number,Organizations, Pages,Publisher,School, Series, Title, Report_Type, Volume,Year,URL,Custom1,Custom2,Custom3,Custom4,Custom5,ISBN,Abstract]