/usr/share/pyshared/cogent/parse/pdb.py is in python-cogent 1.5.3-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | #!/usr/bin/env python
"""PDB parser class and parsing utility functions."""
from re import compile
from numpy import array, linalg
from cogent.data.protein_properties import AA_NAMES
from cogent.core.entity import StructureBuilder, ConstructionWarning, ConstructionError
__author__ = "Marcin Cieslik"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Marcin Cieslik"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Marcin Cieslik"
__email__ = "mpc4p@virginia.edu"
__status__ = "Development"
match_coords = compile('^HETATM|ATOM|MODEL') # start of coordinates
match_trailer = compile('^CONECT') # end of coordinates
# default PDB format string -> \n neccesery for writelines
# try not to parse a second \n eg. by trying to parse charge
PDB_COORDS_STRING = "%s%5i %-4s%c%3s %c%4i%c %8.3f%8.3f%8.3f%6.2f%6.2f %4s%2s\n"
PDB_TER_STRING = "%s%5i %-4s%c%3s %c%4i%c\n"
def dict2pdb(d):
"""Transform an atom dictionary into a valid PDB line."""
(x, y, z) = d['coords']
args = (d['at_type'], d['ser_num'], d['at_name'], d['alt_loc'],
d['res_name'][-3:], d['chain_id'], d['res_id'], d['res_ic'],
x , y , z , d['occupancy'], d['bfactor'], d['seg_id'], d['element'])
return PDB_COORDS_STRING % args
def dict2ter(d):
"""Transforms an atom dictionary into a valid TER line AFTER it."""
args = ('TER ', d['ser_num'] + 1, ' ', ' ',
d['res_name'][-3:], d['chain_id'], d['res_id'], d['res_ic'])
return PDB_TER_STRING % args
def pdb2dict(line):
"""Parses a valid PDB line into an atomic dictionary."""
at_type = line[0:6]
ser_num = int(line[6:11]) # numbers are ints's?
at_name = line[12:16] # " N B "
at_id = at_name.strip() # "N B"
alt_loc = line[16:17] # always keep \s
res_name = line[17:20] # non standard is 4chars long
chain_id = line[21:22]
res_id = int(line[22:26].strip()) # pdb requirement int
res_ic = line[26:27]
x = line[30:38] # no float conversion necessery
y = line[38:46] # it gets loaded straight into an
z = line[46:54] # numpy array.
occupancy = float(line[54:60])
bfactor = float(line[60:66])
seg_id = line[72:76]
element = line[76:78]
h_flag = ' ' # this is the default
if at_type == 'HETATM': # hetatms get the het flag (it's not used for writing)
if res_name not in ('MSE', 'SEL'): # SeMet are not ligands
h_flag = 'H'
res_name = '%s_%s' % (h_flag, res_name)
at_long_id = (at_id, alt_loc)
res_long_id = (res_name, res_id, res_ic)
coords = array((x, y, z)).astype("double")
result = {
'at_type': at_type, 'ser_num': ser_num, 'at_name': at_name,
'at_id': at_id, 'alt_loc': alt_loc, 'res_name': res_name,
'chain_id': chain_id, 'res_id': res_id, 'res_ic': res_ic,
'h_flag': h_flag, 'coords': coords, 'occupancy': occupancy,
'bfactor': bfactor, 'seg_id': seg_id, 'res_long_id': res_long_id,
'at_long_id': at_long_id, 'element':element}
return result
def get_symmetry(header):
"""Extracts symmetry operations from header, either by parsing of
conversion matrices (SMTRY or BIOMT) or using CCTBX based on the
space group group data name and a,b,c,alpha,beta,gamma.
"""
header_result = {}
for (mode, remark) in (('uc', 'REMARK 290 SMTRY'), ('bio', 'REMARK 350 BIOMT')):
# parsing the raw-header matrices
# uc: parsing the symmetry matrices for a given space group
# bio: parsing the symmetry matrices to construct the biological molecule
#'REMARK 290 SMTRY3 96 -1.000000 0.000000 0.000000 0.00000'
# how should we deal with the ORIGx matrix?
# needed function to check if the SCALEn matrix is correct with
# respect to the CRYST1 card.
mx_num = 1
new_chain = False
fmx, mxs, mx, cmx = [], [], [], []
for line in header:
if line.startswith('SCALE') and mode == 'uc':
data = map(float, line[6:].split()[:-1])
fmx.append(data)
elif line.startswith(remark):
m_line = map(float, line[20:].split())
if mx_num != m_line[0] or new_chain:
mx.append([ 0., 0., 0., 1.])
mxs.append(mx)
mx = []
if mode == 'bio':
if new_chain:
new_chain = False
else:
cmx[-1][1] += 1
mx_num = m_line[0]
mx.append(m_line[1:])
elif line.startswith('REMARK 350 APPLY') and mode == 'bio':
if cmx: new_chain = True; cmx[-1][1] += 1
chains = [(c.strip(),) for c in line[42:].split(',')]
cmx.append([chains, 0])
mx.append([ 0., 0., 0., 1.]) # finish the last matrix
mxs.append(mx)
mxs = array(mxs) # symmetry matrices
header_result[mode + "_mxs"] = mxs
if mode == 'uc':
fmx = array(fmx) # fractionalization_matrix
omx = linalg.inv(fmx) # orthogonalization_matrix
header_result["uc_fmx"] = fmx
header_result["uc_omx"] = omx
elif mode == 'bio':
cmx[-1][1] += 1
header_result[mode + "_cmx"] = cmx
return header_result
def get_coords_offset(line_list):
"""Determine the line number where coordinates begin."""
i = 0
for i, line in enumerate(line_list):
if match_coords.match(line):
break
return i
def get_trailer_offset(line_list):
"""Determine the line number where coordinates end."""
i = 0
for i, line in enumerate(line_list):
if match_trailer.match(line):
break
return i
def parse_header(header):
"""Parse parts of the PDB header."""
id = (compile('HEADER\s{4}\S+.*\S+\s*\S{9}\s+(\S{4})\s*$'), 'id')
dt = (compile('HEADER\s{4}\S+.*\S+\s+(\S{9})\s+\S{4}\s*$'), 'date')
nm = (compile('HEADER\s{4}(\S+.*\S+)\s+\S{9}\s+\S{4}\s*$'), 'name')
mc = (compile('REMARK 280\s+MATTHEWS COEFFICIENT,\s+VM\s+\(ANGSTROMS\*\*3/DA\):\s+(\d+\.\d+)'), 'matthews')
sc = (compile('REMARK 280\s+SOLVENT CONTENT,\s+VS\s+\(%\):\s+(\d+\.\d+)'), 'solvent_content')
sg = (compile('REMARK 290\s+SYMMETRY OPERATORS FOR SPACE GROUP:\s+(.*\S)'), 'space_group')
xt = (compile('REMARK 200\s+EXPERIMENT TYPE\s+:\s+(.*\S)'), 'experiment_type')
rs = (compile('REMARK 2\s+RESOLUTION\.\s+(\d+\.\d+)\s+ANGSTROMS\.'), 'resolution')
rf = (compile('REMARK 3\s+FREE R VALUE\s+:\s+(\d+\.\d+)'), 'r_free')
xd = (compile('EXPDTA\s+([\w\-]+).*'), 'expdta')
c1 = (compile('CRYST1\s+((\d+\.\d+\s+){6})'), 'cryst1')
ra = (compile('DBREF\s+\S{4}\s+\S\s+\d+\s+\d+\s+\S+\s+(\S+)\s+\S+\s+\d+\s+\d+\s+$'), 'dbref_acc')
rx = (compile('DBREF\s+\S{4}\s+\S\s+\d+\s+\d+\s+\S+\s+\S+\s+(\S+)\s+\d+\s+\d+\s+$'), 'dbref_acc_full')
#CRYST1 60.456 60.456 82.526 90.00 90.00 90.00 P 41 4
#DBREF 1UI9 A 1 122 UNP Q84FH6 Q84FH6_THETH 1 122 \n'
tests = [id, dt, nm, mc, sc, sg, xt, rs, rf, xd, c1, ra, rx]
results = {}
for line in header:
for (regexp, name) in tests:
try:
results.update({name:regexp.search(line).group(1).strip()})
continue # taking only the first grep.
except AttributeError:
pass
return results
def parse_coords(builder, coords, forgive=1):
"""Parse coordinate lines."""
current_model_id = 0
model_open = False
current_chain_id = None
current_seg_id = None
current_res_long_id = None
current_res_name = None
for line in coords:
record_type = line[0:6]
if record_type == 'MODEL ':
builder.initModel(current_model_id)
current_model_id += 1
model_open = True
current_chain_id = None
current_res_id = None
elif record_type == 'ENDMDL':
model_open = False
current_chain_id = None
current_res_id = None
elif record_type == 'ATOM ' or record_type == 'HETATM':
if not model_open:
builder.initModel(current_model_id)
current_model_id += 1
model_open = 1
new_chain = False
fields = pdb2dict(line)
if current_seg_id != fields['seg_id']:
current_seg_id = fields['seg_id']
builder.initSeg(current_seg_id)
if current_chain_id != fields['chain_id']:
current_chain_id = fields['chain_id']
new_chain = True
try:
builder.initChain(current_chain_id)
except ConstructionWarning:
if not forgive:
raise ConstructionError
if current_res_name != fields['res_name'] or current_res_long_id != fields['res_long_id'] or new_chain:
current_res_long_id = fields['res_long_id']
current_res_name = fields['res_name']
try:
builder.initResidue(fields['res_long_id'], fields['h_flag'])
except ConstructionWarning:
if not forgive:
raise ConstructionError
new_chain = False
try:
builder.initAtom(fields['at_long_id'], fields['at_name'], fields['ser_num'], \
fields['coords'], fields['occupancy'], fields['bfactor'], \
fields['element'])
except ConstructionError:
if not forgive > 1:
raise ConstructionError
return builder.getStructure()
def parse_trailer(trailer):
return {}
def PDBParser(open_file, structure_id=None, forgive=2):
"""Parse a PDB file and return a Structure object."""
file_ = open_file.readlines()
builder = StructureBuilder()
c_offset = get_coords_offset(file_)
t_offset = get_trailer_offset(file_)
raw_coords = file_[c_offset:t_offset]
raw_trailer = file_[t_offset:]
raw_header = file_[:c_offset]
parsed_header = parse_header(raw_header)
parsed_trailer = parse_trailer(raw_trailer)
structure_id = (structure_id or parsed_header.get('id'))
builder.initStructure(structure_id)
structure = parse_coords(builder, raw_coords, forgive)
# only X-ray structures will contain crystallographic data
if parsed_header.get('expdta') == 'X-RAY':
symetry_info = get_symmetry(raw_header)
parsed_header.update(symetry_info)
structure.header = parsed_header
structure.trailer = parsed_trailer
structure.raw_header = raw_header
structure.raw_trailer = raw_trailer
return structure
|