1# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 2# This code is part of the Biopython distribution and governed by its 3# license. Please see the LICENSE file that should have been included 4# as part of this package. 5 6"""mmCIF parsers.""" 7 8 9import numpy 10import warnings 11 12from Bio.File import as_handle 13 14from Bio.PDB.MMCIF2Dict import MMCIF2Dict 15from Bio.PDB.StructureBuilder import StructureBuilder 16from Bio.PDB.PDBExceptions import PDBConstructionException 17from Bio.PDB.PDBExceptions import PDBConstructionWarning 18 19 20class MMCIFParser: 21 """Parse a mmCIF file and return a Structure object.""" 22 23 def __init__(self, structure_builder=None, QUIET=False): 24 """Create a PDBParser object. 25 26 The mmCIF parser calls a number of standard methods in an aggregated 27 StructureBuilder object. Normally this object is instanciated by the 28 MMCIParser object itself, but if the user provides his/her own 29 StructureBuilder object, the latter is used instead. 30 31 Arguments: 32 - structure_builder - an optional user implemented StructureBuilder class. 33 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 34 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 35 These warnings might be indicative of problems in the mmCIF file! 36 37 """ 38 if structure_builder is not None: 39 self._structure_builder = structure_builder 40 else: 41 self._structure_builder = StructureBuilder() 42 self.header = None 43 # self.trailer = None 44 self.line_counter = 0 45 self.build_structure = None 46 self.QUIET = bool(QUIET) 47 48 # Public methods 49 50 def get_structure(self, structure_id, filename): 51 """Return the structure. 52 53 Arguments: 54 - structure_id - string, the id that will be used for the structure 55 - filename - name of mmCIF file, OR an open text mode file handle 56 57 """ 58 with warnings.catch_warnings(): 59 if self.QUIET: 60 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 61 self._mmcif_dict = MMCIF2Dict(filename) 62 self._build_structure(structure_id) 63 self._structure_builder.set_header(self._get_header()) 64 65 return self._structure_builder.get_structure() 66 67 # Private methods 68 69 def _mmcif_get(self, key, dict, deflt): 70 if key in dict: 71 rslt = dict[key][0] 72 if "?" != rslt: 73 return rslt 74 return deflt 75 76 def _update_header_entry(self, target_key, keys): 77 md = self._mmcif_dict 78 for key in keys: 79 val = md.get(key) 80 try: 81 item = val[0] 82 except (TypeError, IndexError): 83 continue 84 if item != "?": 85 self.header[target_key] = item 86 break 87 88 def _get_header(self): 89 self.header = { 90 "name": "", 91 "head": "", 92 "idcode": "", 93 "deposition_date": "", 94 "structure_method": "", 95 "resolution": None, 96 } 97 98 self._update_header_entry( 99 "idcode", ["_entry_id", "_exptl.entry_id", "_struct.entry_id"] 100 ) 101 self._update_header_entry("name", ["_struct.title"]) 102 self._update_header_entry( 103 "head", ["_struct_keywords.pdbx_keywords", "_struct_keywords.text"] 104 ) 105 self._update_header_entry( 106 "deposition_date", ["_pdbx_database_status.recvd_initial_deposition_date"] 107 ) 108 self._update_header_entry("structure_method", ["_exptl.method"]) 109 self._update_header_entry( 110 "resolution", 111 [ 112 "_refine.ls_d_res_high", 113 "_refine_hist.d_res_high", 114 "_em_3d_reconstruction.resolution", 115 ], 116 ) 117 if self.header["resolution"] is not None: 118 try: 119 self.header["resolution"] = float(self.header["resolution"]) 120 except ValueError: 121 self.header["resolution"] = None 122 123 return self.header 124 125 def _build_structure(self, structure_id): 126 127 # two special chars as placeholders in the mmCIF format 128 # for item values that cannot be explicitly assigned 129 # see: pdbx/mmcif syntax web page 130 _unassigned = {".", "?"} 131 132 mmcif_dict = self._mmcif_dict 133 134 atom_serial_list = mmcif_dict["_atom_site.id"] 135 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 136 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 137 try: 138 element_list = mmcif_dict["_atom_site.type_symbol"] 139 except KeyError: 140 element_list = None 141 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 142 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 143 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 144 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 145 alt_list = mmcif_dict["_atom_site.label_alt_id"] 146 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 147 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 148 occupancy_list = mmcif_dict["_atom_site.occupancy"] 149 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 150 try: 151 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 152 except KeyError: 153 # No model number column 154 serial_list = None 155 except ValueError: 156 # Invalid model number (malformed file) 157 raise PDBConstructionException("Invalid model number") from None 158 try: 159 aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"] 160 aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"] 161 aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"] 162 aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"] 163 aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"] 164 aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"] 165 aniso_flag = 1 166 except KeyError: 167 # no anisotropic B factors 168 aniso_flag = 0 169 # if auth_seq_id is present, we use this. 170 # Otherwise label_seq_id is used. 171 if "_atom_site.auth_seq_id" in mmcif_dict: 172 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 173 else: 174 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 175 # Now loop over atoms and build the structure 176 current_chain_id = None 177 current_residue_id = None 178 current_resname = None 179 structure_builder = self._structure_builder 180 structure_builder.init_structure(structure_id) 181 structure_builder.init_seg(" ") 182 # Historically, Biopython PDB parser uses model_id to mean array index 183 # so serial_id means the Model ID specified in the file 184 current_model_id = -1 185 current_serial_id = -1 186 for i in range(0, len(atom_id_list)): 187 188 # set the line_counter for 'ATOM' lines only and not 189 # as a global line counter found in the PDBParser() 190 structure_builder.set_line_counter(i) 191 192 # Try coercing serial to int, for compatibility with PDBParser 193 # But do not quit if it fails. mmCIF format specs allow strings. 194 try: 195 serial = int(atom_serial_list[i]) 196 except ValueError: 197 serial = atom_serial_list[i] 198 warnings.warn( 199 "PDBConstructionWarning: " 200 "Some atom serial numbers are not numerical", 201 PDBConstructionWarning, 202 ) 203 204 x = x_list[i] 205 y = y_list[i] 206 z = z_list[i] 207 resname = residue_id_list[i] 208 chainid = chain_id_list[i] 209 altloc = alt_list[i] 210 if altloc in _unassigned: 211 altloc = " " 212 int_resseq = int(seq_id_list[i]) 213 icode = icode_list[i] 214 if icode in _unassigned: 215 icode = " " 216 name = atom_id_list[i] 217 # occupancy & B factor 218 try: 219 tempfactor = float(b_factor_list[i]) 220 except ValueError: 221 raise PDBConstructionException("Invalid or missing B factor") from None 222 try: 223 occupancy = float(occupancy_list[i]) 224 except ValueError: 225 raise PDBConstructionException("Invalid or missing occupancy") from None 226 fieldname = fieldname_list[i] 227 if fieldname == "HETATM": 228 if resname == "HOH" or resname == "WAT": 229 hetatm_flag = "W" 230 else: 231 hetatm_flag = "H" 232 else: 233 hetatm_flag = " " 234 235 resseq = (hetatm_flag, int_resseq, icode) 236 237 if serial_list is not None: 238 # model column exists; use it 239 serial_id = serial_list[i] 240 if current_serial_id != serial_id: 241 # if serial changes, update it and start new model 242 current_serial_id = serial_id 243 current_model_id += 1 244 structure_builder.init_model(current_model_id, current_serial_id) 245 current_chain_id = None 246 current_residue_id = None 247 current_resname = None 248 else: 249 # no explicit model column; initialize single model 250 structure_builder.init_model(current_model_id) 251 252 if current_chain_id != chainid: 253 current_chain_id = chainid 254 structure_builder.init_chain(current_chain_id) 255 current_residue_id = None 256 current_resname = None 257 258 if current_residue_id != resseq or current_resname != resname: 259 current_residue_id = resseq 260 current_resname = resname 261 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 262 263 coord = numpy.array((x, y, z), "f") 264 element = element_list[i].upper() if element_list else None 265 structure_builder.init_atom( 266 name, 267 coord, 268 tempfactor, 269 occupancy, 270 altloc, 271 name, 272 serial_number=serial, 273 element=element, 274 ) 275 if aniso_flag == 1 and i < len(aniso_u11): 276 u = ( 277 aniso_u11[i], 278 aniso_u12[i], 279 aniso_u13[i], 280 aniso_u22[i], 281 aniso_u23[i], 282 aniso_u33[i], 283 ) 284 mapped_anisou = [float(_) for _ in u] 285 anisou_array = numpy.array(mapped_anisou, "f") 286 structure_builder.set_anisou(anisou_array) 287 # Now try to set the cell 288 try: 289 a = float(mmcif_dict["_cell.length_a"][0]) 290 b = float(mmcif_dict["_cell.length_b"][0]) 291 c = float(mmcif_dict["_cell.length_c"][0]) 292 alpha = float(mmcif_dict["_cell.angle_alpha"][0]) 293 beta = float(mmcif_dict["_cell.angle_beta"][0]) 294 gamma = float(mmcif_dict["_cell.angle_gamma"][0]) 295 cell = numpy.array((a, b, c, alpha, beta, gamma), "f") 296 spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"][0] 297 spacegroup = spacegroup[1:-1] # get rid of quotes!! 298 if spacegroup is None: 299 raise Exception 300 structure_builder.set_symmetry(spacegroup, cell) 301 except Exception: 302 pass # no cell found, so just ignore 303 304 305class FastMMCIFParser: 306 """Parse an MMCIF file and return a Structure object.""" 307 308 def __init__(self, structure_builder=None, QUIET=False): 309 """Create a FastMMCIFParser object. 310 311 The mmCIF parser calls a number of standard methods in an aggregated 312 StructureBuilder object. Normally this object is instanciated by the 313 parser object itself, but if the user provides his/her own 314 StructureBuilder object, the latter is used instead. 315 316 The main difference between this class and the regular MMCIFParser is 317 that only 'ATOM' and 'HETATM' lines are parsed here. Use if you are 318 interested only in coordinate information. 319 320 Arguments: 321 - structure_builder - an optional user implemented StructureBuilder class. 322 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 323 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 324 These warnings might be indicative of problems in the mmCIF file! 325 326 """ 327 if structure_builder is not None: 328 self._structure_builder = structure_builder 329 else: 330 self._structure_builder = StructureBuilder() 331 332 self.line_counter = 0 333 self.build_structure = None 334 self.QUIET = bool(QUIET) 335 336 # Public methods 337 338 def get_structure(self, structure_id, filename): 339 """Return the structure. 340 341 Arguments: 342 - structure_id - string, the id that will be used for the structure 343 - filename - name of the mmCIF file OR an open filehandle 344 345 """ 346 with warnings.catch_warnings(): 347 if self.QUIET: 348 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 349 with as_handle(filename) as handle: 350 self._build_structure(structure_id, handle) 351 352 return self._structure_builder.get_structure() 353 354 # Private methods 355 356 def _build_structure(self, structure_id, filehandle): 357 358 # two special chars as placeholders in the mmCIF format 359 # for item values that cannot be explicitly assigned 360 # see: pdbx/mmcif syntax web page 361 _unassigned = {".", "?"} 362 363 # Read only _atom_site. and atom_site_anisotrop entries 364 read_atom, read_aniso = False, False 365 _fields, _records = [], [] 366 _anisof, _anisors = [], [] 367 for line in filehandle: 368 if line.startswith("_atom_site."): 369 read_atom = True 370 _fields.append(line.strip()) 371 elif line.startswith("_atom_site_anisotrop."): 372 read_aniso = True 373 _anisof.append(line.strip()) 374 elif read_atom and line.startswith("#"): 375 read_atom = False 376 elif read_aniso and line.startswith("#"): 377 read_aniso = False 378 elif read_atom: 379 _records.append(line.strip()) 380 elif read_aniso: 381 _anisors.append(line.strip()) 382 383 # Dumping the shlex module here since this particular 384 # category should be rather straightforward. 385 # Quite a performance boost.. 386 _record_tbl = zip(*map(str.split, _records)) 387 _anisob_tbl = zip(*map(str.split, _anisors)) 388 389 mmcif_dict = dict(zip(_fields, _record_tbl)) 390 mmcif_dict.update(dict(zip(_anisof, _anisob_tbl))) 391 392 # Build structure object 393 atom_serial_list = mmcif_dict["_atom_site.id"] 394 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 395 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 396 397 try: 398 element_list = mmcif_dict["_atom_site.type_symbol"] 399 except KeyError: 400 element_list = None 401 402 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 403 404 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 405 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 406 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 407 alt_list = mmcif_dict["_atom_site.label_alt_id"] 408 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 409 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 410 occupancy_list = mmcif_dict["_atom_site.occupancy"] 411 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 412 413 try: 414 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 415 except KeyError: 416 # No model number column 417 serial_list = None 418 except ValueError: 419 # Invalid model number (malformed file) 420 raise PDBConstructionException("Invalid model number") from None 421 422 try: 423 aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"] 424 aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"] 425 aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"] 426 aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"] 427 aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"] 428 aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"] 429 aniso_flag = 1 430 except KeyError: 431 # no anisotropic B factors 432 aniso_flag = 0 433 434 # if auth_seq_id is present, we use this. 435 # Otherwise label_seq_id is used. 436 if "_atom_site.auth_seq_id" in mmcif_dict: 437 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 438 else: 439 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 440 441 # Now loop over atoms and build the structure 442 current_chain_id = None 443 current_residue_id = None 444 current_resname = None 445 structure_builder = self._structure_builder 446 structure_builder.init_structure(structure_id) 447 structure_builder.init_seg(" ") 448 449 # Historically, Biopython PDB parser uses model_id to mean array index 450 # so serial_id means the Model ID specified in the file 451 current_model_id = -1 452 current_serial_id = -1 453 for i in range(0, len(atom_id_list)): 454 455 # set the line_counter for 'ATOM' lines only and not 456 # as a global line counter found in the PDBParser() 457 structure_builder.set_line_counter(i) 458 459 serial = atom_serial_list[i] 460 461 x = x_list[i] 462 y = y_list[i] 463 z = z_list[i] 464 resname = residue_id_list[i] 465 chainid = chain_id_list[i] 466 altloc = alt_list[i] 467 if altloc in _unassigned: 468 altloc = " " 469 int_resseq = int(seq_id_list[i]) 470 icode = icode_list[i] 471 if icode in _unassigned: 472 icode = " " 473 # Remove occasional " from quoted atom names (e.g. xNA) 474 name = atom_id_list[i].strip('"') 475 476 # occupancy & B factor 477 try: 478 tempfactor = float(b_factor_list[i]) 479 except ValueError: 480 raise PDBConstructionException("Invalid or missing B factor") from None 481 482 try: 483 occupancy = float(occupancy_list[i]) 484 except ValueError: 485 raise PDBConstructionException("Invalid or missing occupancy") from None 486 487 fieldname = fieldname_list[i] 488 if fieldname == "HETATM": 489 hetatm_flag = "H" 490 else: 491 hetatm_flag = " " 492 493 resseq = (hetatm_flag, int_resseq, icode) 494 495 if serial_list is not None: 496 # model column exists; use it 497 serial_id = serial_list[i] 498 if current_serial_id != serial_id: 499 # if serial changes, update it and start new model 500 current_serial_id = serial_id 501 current_model_id += 1 502 structure_builder.init_model(current_model_id, current_serial_id) 503 current_chain_id = None 504 current_residue_id = None 505 current_resname = None 506 else: 507 # no explicit model column; initialize single model 508 structure_builder.init_model(current_model_id) 509 510 if current_chain_id != chainid: 511 current_chain_id = chainid 512 structure_builder.init_chain(current_chain_id) 513 current_residue_id = None 514 current_resname = None 515 516 if current_residue_id != resseq or current_resname != resname: 517 current_residue_id = resseq 518 current_resname = resname 519 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 520 521 coord = numpy.array((x, y, z), "f") 522 element = element_list[i] if element_list else None 523 structure_builder.init_atom( 524 name, 525 coord, 526 tempfactor, 527 occupancy, 528 altloc, 529 name, 530 serial_number=serial, 531 element=element, 532 ) 533 if aniso_flag == 1 and i < len(aniso_u11): 534 u = ( 535 aniso_u11[i], 536 aniso_u12[i], 537 aniso_u13[i], 538 aniso_u22[i], 539 aniso_u23[i], 540 aniso_u33[i], 541 ) 542 mapped_anisou = [float(_) for _ in u] 543 anisou_array = numpy.array(mapped_anisou, "f") 544 structure_builder.set_anisou(anisou_array) 545