1# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
2# This code is part of the Biopython distribution and governed by its
3# license.  Please see the LICENSE file that should have been included
4# as part of this package.
5
6"""mmCIF parsers."""
7
8
9import numpy
10import warnings
11
12from Bio.File import as_handle
13
14from Bio.PDB.MMCIF2Dict import MMCIF2Dict
15from Bio.PDB.StructureBuilder import StructureBuilder
16from Bio.PDB.PDBExceptions import PDBConstructionException
17from Bio.PDB.PDBExceptions import PDBConstructionWarning
18
19
20class MMCIFParser:
21    """Parse a mmCIF file and return a Structure object."""
22
23    def __init__(self, structure_builder=None, QUIET=False):
24        """Create a PDBParser object.
25
26        The mmCIF parser calls a number of standard methods in an aggregated
27        StructureBuilder object. Normally this object is instanciated by the
28        MMCIParser object itself, but if the user provides his/her own
29        StructureBuilder object, the latter is used instead.
30
31        Arguments:
32         - structure_builder - an optional user implemented StructureBuilder class.
33         - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing
34           the SMCRA data will be suppressed. If false (DEFAULT), they will be shown.
35           These warnings might be indicative of problems in the mmCIF file!
36
37        """
38        if structure_builder is not None:
39            self._structure_builder = structure_builder
40        else:
41            self._structure_builder = StructureBuilder()
42        self.header = None
43        # self.trailer = None
44        self.line_counter = 0
45        self.build_structure = None
46        self.QUIET = bool(QUIET)
47
48    # Public methods
49
50    def get_structure(self, structure_id, filename):
51        """Return the structure.
52
53        Arguments:
54         - structure_id - string, the id that will be used for the structure
55         - filename - name of mmCIF file, OR an open text mode file handle
56
57        """
58        with warnings.catch_warnings():
59            if self.QUIET:
60                warnings.filterwarnings("ignore", category=PDBConstructionWarning)
61            self._mmcif_dict = MMCIF2Dict(filename)
62            self._build_structure(structure_id)
63            self._structure_builder.set_header(self._get_header())
64
65        return self._structure_builder.get_structure()
66
67    # Private methods
68
69    def _mmcif_get(self, key, dict, deflt):
70        if key in dict:
71            rslt = dict[key][0]
72            if "?" != rslt:
73                return rslt
74        return deflt
75
76    def _update_header_entry(self, target_key, keys):
77        md = self._mmcif_dict
78        for key in keys:
79            val = md.get(key)
80            try:
81                item = val[0]
82            except (TypeError, IndexError):
83                continue
84            if item != "?":
85                self.header[target_key] = item
86                break
87
88    def _get_header(self):
89        self.header = {
90            "name": "",
91            "head": "",
92            "idcode": "",
93            "deposition_date": "",
94            "structure_method": "",
95            "resolution": None,
96        }
97
98        self._update_header_entry(
99            "idcode", ["_entry_id", "_exptl.entry_id", "_struct.entry_id"]
100        )
101        self._update_header_entry("name", ["_struct.title"])
102        self._update_header_entry(
103            "head", ["_struct_keywords.pdbx_keywords", "_struct_keywords.text"]
104        )
105        self._update_header_entry(
106            "deposition_date", ["_pdbx_database_status.recvd_initial_deposition_date"]
107        )
108        self._update_header_entry("structure_method", ["_exptl.method"])
109        self._update_header_entry(
110            "resolution",
111            [
112                "_refine.ls_d_res_high",
113                "_refine_hist.d_res_high",
114                "_em_3d_reconstruction.resolution",
115            ],
116        )
117        if self.header["resolution"] is not None:
118            try:
119                self.header["resolution"] = float(self.header["resolution"])
120            except ValueError:
121                self.header["resolution"] = None
122
123        return self.header
124
125    def _build_structure(self, structure_id):
126
127        # two special chars as placeholders in the mmCIF format
128        # for item values that cannot be explicitly assigned
129        # see: pdbx/mmcif syntax web page
130        _unassigned = {".", "?"}
131
132        mmcif_dict = self._mmcif_dict
133
134        atom_serial_list = mmcif_dict["_atom_site.id"]
135        atom_id_list = mmcif_dict["_atom_site.label_atom_id"]
136        residue_id_list = mmcif_dict["_atom_site.label_comp_id"]
137        try:
138            element_list = mmcif_dict["_atom_site.type_symbol"]
139        except KeyError:
140            element_list = None
141        chain_id_list = mmcif_dict["_atom_site.auth_asym_id"]
142        x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]]
143        y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]]
144        z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]]
145        alt_list = mmcif_dict["_atom_site.label_alt_id"]
146        icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"]
147        b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"]
148        occupancy_list = mmcif_dict["_atom_site.occupancy"]
149        fieldname_list = mmcif_dict["_atom_site.group_PDB"]
150        try:
151            serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]]
152        except KeyError:
153            # No model number column
154            serial_list = None
155        except ValueError:
156            # Invalid model number (malformed file)
157            raise PDBConstructionException("Invalid model number") from None
158        try:
159            aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"]
160            aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"]
161            aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"]
162            aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"]
163            aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"]
164            aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"]
165            aniso_flag = 1
166        except KeyError:
167            # no anisotropic B factors
168            aniso_flag = 0
169        # if auth_seq_id is present, we use this.
170        # Otherwise label_seq_id is used.
171        if "_atom_site.auth_seq_id" in mmcif_dict:
172            seq_id_list = mmcif_dict["_atom_site.auth_seq_id"]
173        else:
174            seq_id_list = mmcif_dict["_atom_site.label_seq_id"]
175        # Now loop over atoms and build the structure
176        current_chain_id = None
177        current_residue_id = None
178        current_resname = None
179        structure_builder = self._structure_builder
180        structure_builder.init_structure(structure_id)
181        structure_builder.init_seg(" ")
182        # Historically, Biopython PDB parser uses model_id to mean array index
183        # so serial_id means the Model ID specified in the file
184        current_model_id = -1
185        current_serial_id = -1
186        for i in range(0, len(atom_id_list)):
187
188            # set the line_counter for 'ATOM' lines only and not
189            # as a global line counter found in the PDBParser()
190            structure_builder.set_line_counter(i)
191
192            # Try coercing serial to int, for compatibility with PDBParser
193            # But do not quit if it fails. mmCIF format specs allow strings.
194            try:
195                serial = int(atom_serial_list[i])
196            except ValueError:
197                serial = atom_serial_list[i]
198                warnings.warn(
199                    "PDBConstructionWarning: "
200                    "Some atom serial numbers are not numerical",
201                    PDBConstructionWarning,
202                )
203
204            x = x_list[i]
205            y = y_list[i]
206            z = z_list[i]
207            resname = residue_id_list[i]
208            chainid = chain_id_list[i]
209            altloc = alt_list[i]
210            if altloc in _unassigned:
211                altloc = " "
212            int_resseq = int(seq_id_list[i])
213            icode = icode_list[i]
214            if icode in _unassigned:
215                icode = " "
216            name = atom_id_list[i]
217            # occupancy & B factor
218            try:
219                tempfactor = float(b_factor_list[i])
220            except ValueError:
221                raise PDBConstructionException("Invalid or missing B factor") from None
222            try:
223                occupancy = float(occupancy_list[i])
224            except ValueError:
225                raise PDBConstructionException("Invalid or missing occupancy") from None
226            fieldname = fieldname_list[i]
227            if fieldname == "HETATM":
228                if resname == "HOH" or resname == "WAT":
229                    hetatm_flag = "W"
230                else:
231                    hetatm_flag = "H"
232            else:
233                hetatm_flag = " "
234
235            resseq = (hetatm_flag, int_resseq, icode)
236
237            if serial_list is not None:
238                # model column exists; use it
239                serial_id = serial_list[i]
240                if current_serial_id != serial_id:
241                    # if serial changes, update it and start new model
242                    current_serial_id = serial_id
243                    current_model_id += 1
244                    structure_builder.init_model(current_model_id, current_serial_id)
245                    current_chain_id = None
246                    current_residue_id = None
247                    current_resname = None
248            else:
249                # no explicit model column; initialize single model
250                structure_builder.init_model(current_model_id)
251
252            if current_chain_id != chainid:
253                current_chain_id = chainid
254                structure_builder.init_chain(current_chain_id)
255                current_residue_id = None
256                current_resname = None
257
258            if current_residue_id != resseq or current_resname != resname:
259                current_residue_id = resseq
260                current_resname = resname
261                structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode)
262
263            coord = numpy.array((x, y, z), "f")
264            element = element_list[i].upper() if element_list else None
265            structure_builder.init_atom(
266                name,
267                coord,
268                tempfactor,
269                occupancy,
270                altloc,
271                name,
272                serial_number=serial,
273                element=element,
274            )
275            if aniso_flag == 1 and i < len(aniso_u11):
276                u = (
277                    aniso_u11[i],
278                    aniso_u12[i],
279                    aniso_u13[i],
280                    aniso_u22[i],
281                    aniso_u23[i],
282                    aniso_u33[i],
283                )
284                mapped_anisou = [float(_) for _ in u]
285                anisou_array = numpy.array(mapped_anisou, "f")
286                structure_builder.set_anisou(anisou_array)
287        # Now try to set the cell
288        try:
289            a = float(mmcif_dict["_cell.length_a"][0])
290            b = float(mmcif_dict["_cell.length_b"][0])
291            c = float(mmcif_dict["_cell.length_c"][0])
292            alpha = float(mmcif_dict["_cell.angle_alpha"][0])
293            beta = float(mmcif_dict["_cell.angle_beta"][0])
294            gamma = float(mmcif_dict["_cell.angle_gamma"][0])
295            cell = numpy.array((a, b, c, alpha, beta, gamma), "f")
296            spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"][0]
297            spacegroup = spacegroup[1:-1]  # get rid of quotes!!
298            if spacegroup is None:
299                raise Exception
300            structure_builder.set_symmetry(spacegroup, cell)
301        except Exception:
302            pass  # no cell found, so just ignore
303
304
305class FastMMCIFParser:
306    """Parse an MMCIF file and return a Structure object."""
307
308    def __init__(self, structure_builder=None, QUIET=False):
309        """Create a FastMMCIFParser object.
310
311        The mmCIF parser calls a number of standard methods in an aggregated
312        StructureBuilder object. Normally this object is instanciated by the
313        parser object itself, but if the user provides his/her own
314        StructureBuilder object, the latter is used instead.
315
316        The main difference between this class and the regular MMCIFParser is
317        that only 'ATOM' and 'HETATM' lines are parsed here. Use if you are
318        interested only in coordinate information.
319
320        Arguments:
321         - structure_builder - an optional user implemented StructureBuilder class.
322         - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing
323           the SMCRA data will be suppressed. If false (DEFAULT), they will be shown.
324           These warnings might be indicative of problems in the mmCIF file!
325
326        """
327        if structure_builder is not None:
328            self._structure_builder = structure_builder
329        else:
330            self._structure_builder = StructureBuilder()
331
332        self.line_counter = 0
333        self.build_structure = None
334        self.QUIET = bool(QUIET)
335
336    # Public methods
337
338    def get_structure(self, structure_id, filename):
339        """Return the structure.
340
341        Arguments:
342         - structure_id - string, the id that will be used for the structure
343         - filename - name of the mmCIF file OR an open filehandle
344
345        """
346        with warnings.catch_warnings():
347            if self.QUIET:
348                warnings.filterwarnings("ignore", category=PDBConstructionWarning)
349            with as_handle(filename) as handle:
350                self._build_structure(structure_id, handle)
351
352        return self._structure_builder.get_structure()
353
354    # Private methods
355
356    def _build_structure(self, structure_id, filehandle):
357
358        # two special chars as placeholders in the mmCIF format
359        # for item values that cannot be explicitly assigned
360        # see: pdbx/mmcif syntax web page
361        _unassigned = {".", "?"}
362
363        # Read only _atom_site. and atom_site_anisotrop entries
364        read_atom, read_aniso = False, False
365        _fields, _records = [], []
366        _anisof, _anisors = [], []
367        for line in filehandle:
368            if line.startswith("_atom_site."):
369                read_atom = True
370                _fields.append(line.strip())
371            elif line.startswith("_atom_site_anisotrop."):
372                read_aniso = True
373                _anisof.append(line.strip())
374            elif read_atom and line.startswith("#"):
375                read_atom = False
376            elif read_aniso and line.startswith("#"):
377                read_aniso = False
378            elif read_atom:
379                _records.append(line.strip())
380            elif read_aniso:
381                _anisors.append(line.strip())
382
383        # Dumping the shlex module here since this particular
384        # category should be rather straightforward.
385        # Quite a performance boost..
386        _record_tbl = zip(*map(str.split, _records))
387        _anisob_tbl = zip(*map(str.split, _anisors))
388
389        mmcif_dict = dict(zip(_fields, _record_tbl))
390        mmcif_dict.update(dict(zip(_anisof, _anisob_tbl)))
391
392        # Build structure object
393        atom_serial_list = mmcif_dict["_atom_site.id"]
394        atom_id_list = mmcif_dict["_atom_site.label_atom_id"]
395        residue_id_list = mmcif_dict["_atom_site.label_comp_id"]
396
397        try:
398            element_list = mmcif_dict["_atom_site.type_symbol"]
399        except KeyError:
400            element_list = None
401
402        chain_id_list = mmcif_dict["_atom_site.auth_asym_id"]
403
404        x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]]
405        y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]]
406        z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]]
407        alt_list = mmcif_dict["_atom_site.label_alt_id"]
408        icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"]
409        b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"]
410        occupancy_list = mmcif_dict["_atom_site.occupancy"]
411        fieldname_list = mmcif_dict["_atom_site.group_PDB"]
412
413        try:
414            serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]]
415        except KeyError:
416            # No model number column
417            serial_list = None
418        except ValueError:
419            # Invalid model number (malformed file)
420            raise PDBConstructionException("Invalid model number") from None
421
422        try:
423            aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"]
424            aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"]
425            aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"]
426            aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"]
427            aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"]
428            aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"]
429            aniso_flag = 1
430        except KeyError:
431            # no anisotropic B factors
432            aniso_flag = 0
433
434        # if auth_seq_id is present, we use this.
435        # Otherwise label_seq_id is used.
436        if "_atom_site.auth_seq_id" in mmcif_dict:
437            seq_id_list = mmcif_dict["_atom_site.auth_seq_id"]
438        else:
439            seq_id_list = mmcif_dict["_atom_site.label_seq_id"]
440
441        # Now loop over atoms and build the structure
442        current_chain_id = None
443        current_residue_id = None
444        current_resname = None
445        structure_builder = self._structure_builder
446        structure_builder.init_structure(structure_id)
447        structure_builder.init_seg(" ")
448
449        # Historically, Biopython PDB parser uses model_id to mean array index
450        # so serial_id means the Model ID specified in the file
451        current_model_id = -1
452        current_serial_id = -1
453        for i in range(0, len(atom_id_list)):
454
455            # set the line_counter for 'ATOM' lines only and not
456            # as a global line counter found in the PDBParser()
457            structure_builder.set_line_counter(i)
458
459            serial = atom_serial_list[i]
460
461            x = x_list[i]
462            y = y_list[i]
463            z = z_list[i]
464            resname = residue_id_list[i]
465            chainid = chain_id_list[i]
466            altloc = alt_list[i]
467            if altloc in _unassigned:
468                altloc = " "
469            int_resseq = int(seq_id_list[i])
470            icode = icode_list[i]
471            if icode in _unassigned:
472                icode = " "
473            # Remove occasional " from quoted atom names (e.g. xNA)
474            name = atom_id_list[i].strip('"')
475
476            # occupancy & B factor
477            try:
478                tempfactor = float(b_factor_list[i])
479            except ValueError:
480                raise PDBConstructionException("Invalid or missing B factor") from None
481
482            try:
483                occupancy = float(occupancy_list[i])
484            except ValueError:
485                raise PDBConstructionException("Invalid or missing occupancy") from None
486
487            fieldname = fieldname_list[i]
488            if fieldname == "HETATM":
489                hetatm_flag = "H"
490            else:
491                hetatm_flag = " "
492
493            resseq = (hetatm_flag, int_resseq, icode)
494
495            if serial_list is not None:
496                # model column exists; use it
497                serial_id = serial_list[i]
498                if current_serial_id != serial_id:
499                    # if serial changes, update it and start new model
500                    current_serial_id = serial_id
501                    current_model_id += 1
502                    structure_builder.init_model(current_model_id, current_serial_id)
503                    current_chain_id = None
504                    current_residue_id = None
505                    current_resname = None
506            else:
507                # no explicit model column; initialize single model
508                structure_builder.init_model(current_model_id)
509
510            if current_chain_id != chainid:
511                current_chain_id = chainid
512                structure_builder.init_chain(current_chain_id)
513                current_residue_id = None
514                current_resname = None
515
516            if current_residue_id != resseq or current_resname != resname:
517                current_residue_id = resseq
518                current_resname = resname
519                structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode)
520
521            coord = numpy.array((x, y, z), "f")
522            element = element_list[i] if element_list else None
523            structure_builder.init_atom(
524                name,
525                coord,
526                tempfactor,
527                occupancy,
528                altloc,
529                name,
530                serial_number=serial,
531                element=element,
532            )
533            if aniso_flag == 1 and i < len(aniso_u11):
534                u = (
535                    aniso_u11[i],
536                    aniso_u12[i],
537                    aniso_u13[i],
538                    aniso_u22[i],
539                    aniso_u23[i],
540                    aniso_u33[i],
541                )
542                mapped_anisou = [float(_) for _ in u]
543                anisou_array = numpy.array(mapped_anisou, "f")
544                structure_builder.set_anisou(anisou_array)
545