/usr/share/bibus/Import/PubMedXML.py is in bibus 1.5.2-4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Copyright 2004,2005 Pierre Martineau <pmartino@users.sourceforge.net>
# This file is part of Bibus, a bibliographic database that can
# work together with OpenOffice.org to generate bibliographic indexes.
#
# Bibus is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Bibus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bibus; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# OPENOFFICE_FIELDS=('Identifier', 'Bibliographic_Type', 'Address', 'Annote', 'Author', 'Booktitle', 'Chapter', 'Edition', 'Editor','HowPublished', 'Institution', 'Journal', 'Month', 'Note', 'Number', 'Organizations', 'Pages', 'Publisher', 'School', 'Series', 'Title', 'Report_Type', 'Volume', 'Year', 'URL', 'Custom1', 'Custom2', 'Custom3', 'Custom4', 'Custom5', 'ISBN')
# Bibliographic_Type = BIB.BIBLIOGRAPHIC_TYPE[Type[<PublicationType>]]
# see http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helppubmed.table.pubmedhelp.T41
#
from __future__ import generators # to be removed in python 2.3
import BIB
import xml.dom.minidom
# Module-level encoding setting; it is not referenced anywhere else in the
# visible part of this module.
# NOTE(review): presumably read by the code that loads import filters to decide
# how to decode the input file, with None meaning "no preset encoding" (the XML
# parser handles the encoding itself) -- confirm against the caller.
DEFAULT_ENCODING = None
class importRef(object):
    """Iterable importer for PubMed XML files.

    Iterating over an instance yields one record per <PubmedArticle> element
    of the input. A record is a list whose first item is None (placeholder
    for the database id) followed by the fields listed in _FIELDS, in that
    order. The id is filled in by the main program: usually it will be 'NULL'
    to get an automatic field in MySQL, but it may be different for another
    database backend.

    Elements that are missing or empty in the XML produce empty-string fields
    instead of raising, so one incomplete citation does not abort the import.
    """

    # Order of the fields in a returned record (after the leading None).
    _FIELDS = (
        'Identifier', 'Bibliographic_Type', 'Address', 'Annote', 'Author',
        'Booktitle', 'Chapter', 'Edition', 'Editor', 'HowPublished',
        'Institution', 'Journal', 'Month', 'Note', 'Number', 'Organizations',
        'Pages', 'Publisher', 'School', 'Series', 'Title', 'Report_Type',
        'Volume', 'Year', 'URL', 'Custom1', 'Custom2', 'Custom3', 'Custom4',
        'Custom5', 'ISBN', 'Abstract',
    )

    # (PubMed publication type, key in BIB.BIBLIOGRAPHIC_TYPE), checked in
    # this order: the first PubMed type found in a record wins.
    _TYPE_MAP = (
        ('Technical Report', 'TECHREPORT'),
        ('Congresses', 'PROCEEDINGS'),
        ('Consensus Development Conference', 'PROCEEDINGS'),
        ('Consensus Development Conference, NIH', 'PROCEEDINGS'),
        ('Monograph', 'BOOKLET'),
        ('Textbooks', 'BOOK'),
        ('Laboratory Manuals', 'MANUAL'),
        ('Meeting Abstracts', 'INPROCEEDINGS'),
        ('Posters', 'INPROCEEDINGS'),
        ('Clinical Conference', 'INPROCEEDINGS'),
    )

    def __init__(self, infile):
        """infile is handed unchanged to xml.dom.minidom.parse() on iteration."""
        self.infile = infile

    def __iter__(self):
        """Generator of records. for record in <instance>: ... """
        mydom = xml.dom.minidom.parse(self.infile)
        for article in mydom.getElementsByTagName("PubmedArticle"):
            yield self.__convertRecord(article)

    def _text(self, node):
        """Return all the text contained in node, including the text of nested
        markup elements (<i>, <sub>, ...). Return "" for an empty element."""
        parts = []
        for child in node.childNodes:
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
                parts.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                parts.append(self._text(child))
            # comments and processing instructions carry no field text
        return "".join(parts)

    def _firstText(self, node, tagName, default=""):
        """Return the text of the first <tagName> element below node, or
        default when there is no such element or it holds no text."""
        found = node.getElementsByTagName(tagName)
        if not found:
            return default
        return self._text(found[0]) or default

    def _pubDate(self, journal):
        """Return the (year, month) strings found in the <PubDate> of journal.

        When there is no <Year>, the first two words of <MedlineDate> are used
        instead. Anything that cannot be found is returned as ""."""
        pubDates = journal.getElementsByTagName("PubDate")
        if not pubDates:
            return "", ""
        pubDate = pubDates[0]
        year = self._firstText(pubDate, "Year")
        if year:
            return year, self._firstText(pubDate, "Month")
        # if no year there is a MedlineDate, e.g. "1998 Dec-1999 Jan"
        words = self._firstText(pubDate, "MedlineDate").split()
        if not words:
            return "", ""
        if len(words) > 1:
            return words[0], words[1]
        return words[0], ""

    def _abstract(self, record):
        """Return the abstract text of record, or "" if there is none.

        All the <AbstractText> sections of the first <Abstract> element are
        joined with a space, so structured abstracts are not cut after their
        first section."""
        abstracts = record.getElementsByTagName("Abstract")
        if abstracts:
            sections = abstracts[0].getElementsByTagName("AbstractText")
        else:
            # No <Abstract> wrapper: keep only the first section found.
            sections = record.getElementsByTagName("AbstractText")[:1]
        texts = [self._text(section) for section in sections]
        return " ".join([text for text in texts if text])

    def _authorNames(self, record):
        """Return the list of author names of record ("LastName, Given names").

        Authors flagged ValidYN="N" are dropped, collective names are kept as
        they are, and "et al." is appended when the list is marked incomplete.
        Return [] when the record has no <AuthorList>."""
        authorLists = record.getElementsByTagName("AuthorList")
        if not authorLists:
            return []   # No author as in PMID = 17344863
        authors = authorLists[0]
        names = []
        for author in authors.getElementsByTagName("Author"):
            if author.getAttribute("ValidYN") == "N":   # wrong Author spelling => we drop it
                continue                                # since the correct one is also present
            lastName = self._firstText(author, "LastName")
            if not lastName:
                # Group authors have a <CollectiveName> instead of personal names.
                collective = self._firstText(author, "CollectiveName")
                if collective:
                    names.append(collective)
                    continue
            nameParts = [self._firstText(author, tag)
                         for tag in ("FirstName", "MiddleName", "ForeName", "Suffix")]
            # join the parts that are present and collapse repeated whitespace
            given = " ".join(" ".join(nameParts).split())
            if not given:
                given = self._firstText(author, "Initials")
            names.append(", ".join((lastName, given)))
        # We test if the list of authors is complete and we eventually add "et al."
        if authors.getAttribute("CompleteYN") == "N":
            names.append("et al.")
        return names

    def __toBibusType(self, ptype):
        """ptype is a list of PubMed type since PubMed papers can have multiple type
        We try to return the best matching OOo type or ARTICLE=0 by default"""
        for pubmedType, bibusKey in self._TYPE_MAP:
            if pubmedType in ptype:
                return BIB.BIBLIOGRAPHIC_TYPE[bibusKey]
        return 0    # ARTICLE

    def __convertRecord(self, record):
        """Convert one <PubmedArticle> DOM node to a Bibus record.

        Return the list [None] + one value per name in _FIELDS. Fields that
        PubMed does not provide are empty strings. The ISSN goes into 'ISBN',
        the keywords into 'Custom1' and the PubMed link into 'URL'."""
        fields = dict.fromkeys(self._FIELDS, "")
        # getting the type => list of the texts of the "PublicationType" nodes
        pubtype = [self._text(pt) for pt in record.getElementsByTagName("PublicationType")]
        fields['Bibliographic_Type'] = self.__toBibusType(pubtype)
        # journal
        journals = record.getElementsByTagName("Journal")
        if journals:
            journal = journals[0]
            fields['Volume'] = self._firstText(journal, "Volume")
            fields['Number'] = self._firstText(journal, "Issue")
            fields['Year'], fields['Month'] = self._pubDate(journal)
        fields['Journal'] = self._firstText(record, "MedlineTA")
        fields['ISBN'] = self._firstText(record, "ISSN")    # may be absent on very old records
        # Article data
        pmid = self._firstText(record, "PMID")
        if pmid:
            fields['URL'] = "%s/%s" % (BIB.PUBMEDVIEW, pmid)
        title = self._firstText(record, "ArticleTitle")
        if title.endswith('.'):
            title = title[:-1].rstrip()     # remove final dot if present
        fields['Title'] = title
        fields['Pages'] = self._firstText(record, "MedlinePgn")
        fields['Abstract'] = self._abstract(record)
        fields['Address'] = self._firstText(record, "Affiliation")
        # Authors
        fields['Author'] = BIB.SEP.join(self._authorNames(record))
        # We put keywords in Custom1
        keywords = [self._text(k) for k in record.getElementsByTagName("Keyword")]
        fields['Custom1'] = BIB.SEP.join([k for k in keywords if k])
        #
        return [None] + [fields[name] for name in self._FIELDS]
|