1# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
2# This code is part of the Biopython distribution and governed by its
3# license.  Please see the LICENSE file that should have been included
4# as part of this package.
5
6"""Parser for PDB files."""
7
8
9import warnings
10
11try:
12    import numpy
13except ImportError:
14    from Bio import MissingPythonDependencyError
15
16    raise MissingPythonDependencyError(
17        "Install NumPy if you want to use the PDB parser."
18    ) from None
19
20from Bio.File import as_handle
21
22from Bio.PDB.PDBExceptions import PDBConstructionException
23from Bio.PDB.PDBExceptions import PDBConstructionWarning
24
25from Bio.PDB.StructureBuilder import StructureBuilder
26from Bio.PDB.parse_pdb_header import _parse_pdb_header_list
27
28
29# If PDB spec says "COLUMNS 18-20" this means line[17:20]
30
31
32class PDBParser:
33    """Parse a PDB file and return a Structure object."""
34
35    def __init__(
36        self,
37        PERMISSIVE=True,
38        get_header=False,
39        structure_builder=None,
40        QUIET=False,
41        is_pqr=False,
42    ):
43        """Create a PDBParser object.
44
45        The PDB parser call a number of standard methods in an aggregated
46        StructureBuilder object. Normally this object is instanciated by the
47        PDBParser object itself, but if the user provides his/her own
48        StructureBuilder object, the latter is used instead.
49
50        Arguments:
51         - PERMISSIVE - Evaluated as a Boolean. If false, exceptions in
52           constructing the SMCRA data structure are fatal. If true (DEFAULT),
53           the exceptions are caught, but some residues or atoms will be missing.
54           THESE EXCEPTIONS ARE DUE TO PROBLEMS IN THE PDB FILE!.
55         - get_header - unused argument kept for historical compatibilty.
56         - structure_builder - an optional user implemented StructureBuilder class.
57         - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing
58           the SMCRA data will be suppressed. If false (DEFAULT), they will be shown.
59           These warnings might be indicative of problems in the PDB file!
60         - is_pqr - Evaluated as a Boolean. Specifies the type of file to be parsed.
61           If false (DEFAULT) a .pdb file format is assumed. Set it to true if you
62           want to parse a .pqr file instead.
63
64        """
65        # get_header is not used but is left in for API compatibility
66        if structure_builder is not None:
67            self.structure_builder = structure_builder
68        else:
69            self.structure_builder = StructureBuilder()
70        self.header = None
71        self.trailer = None
72        self.line_counter = 0
73        self.PERMISSIVE = bool(PERMISSIVE)
74        self.QUIET = bool(QUIET)
75        self.is_pqr = bool(is_pqr)
76
77    # Public methods
78
79    def get_structure(self, id, file):
80        """Return the structure.
81
82        Arguments:
83         - id - string, the id that will be used for the structure
84         - file - name of the PDB file OR an open filehandle
85
86        """
87        with warnings.catch_warnings():
88            if self.QUIET:
89                warnings.filterwarnings("ignore", category=PDBConstructionWarning)
90
91            self.header = None
92            self.trailer = None
93            # Make a StructureBuilder instance (pass id of structure as parameter)
94            self.structure_builder.init_structure(id)
95
96            with as_handle(file) as handle:
97                lines = handle.readlines()
98                if not lines:
99                    raise ValueError("Empty file.")
100                self._parse(lines)
101
102            self.structure_builder.set_header(self.header)
103            # Return the Structure instance
104            structure = self.structure_builder.get_structure()
105
106        return structure
107
108    def get_header(self):
109        """Return the header."""
110        return self.header
111
112    def get_trailer(self):
113        """Return the trailer."""
114        return self.trailer
115
116    # Private methods
117
118    def _parse(self, header_coords_trailer):
119        """Parse the PDB file (PRIVATE)."""
120        # Extract the header; return the rest of the file
121        self.header, coords_trailer = self._get_header(header_coords_trailer)
122        # Parse the atomic data; return the PDB file trailer
123        self.trailer = self._parse_coordinates(coords_trailer)
124
125    def _get_header(self, header_coords_trailer):
126        """Get the header of the PDB file, return the rest (PRIVATE)."""
127        structure_builder = self.structure_builder
128        i = 0
129        for i in range(0, len(header_coords_trailer)):
130            structure_builder.set_line_counter(i + 1)
131            line = header_coords_trailer[i]
132            record_type = line[0:6]
133            if record_type in ("ATOM  ", "HETATM", "MODEL "):
134                break
135        header = header_coords_trailer[0:i]
136        # Return the rest of the coords+trailer for further processing
137        self.line_counter = i
138        coords_trailer = header_coords_trailer[i:]
139        header_dict = _parse_pdb_header_list(header)
140        return header_dict, coords_trailer
141
142    def _parse_coordinates(self, coords_trailer):
143        """Parse the atomic data in the PDB file (PRIVATE)."""
144        allowed_records = {
145            "ATOM  ",
146            "HETATM",
147            "MODEL ",
148            "ENDMDL",
149            "TER   ",
150            "ANISOU",
151            # These are older 2.3 format specs:
152            "SIGATM",
153            "SIGUIJ",
154            # bookkeeping records after coordinates:
155            "MASTER",
156        }
157
158        local_line_counter = 0
159        structure_builder = self.structure_builder
160        current_model_id = 0
161        # Flag we have an open model
162        model_open = 0
163        current_chain_id = None
164        current_segid = None
165        current_residue_id = None
166        current_resname = None
167
168        for i in range(0, len(coords_trailer)):
169            line = coords_trailer[i].rstrip("\n")
170            record_type = line[0:6]
171            global_line_counter = self.line_counter + local_line_counter + 1
172            structure_builder.set_line_counter(global_line_counter)
173            if not line.strip():
174                continue  # skip empty lines
175            elif record_type == "ATOM  " or record_type == "HETATM":
176                # Initialize the Model - there was no explicit MODEL record
177                if not model_open:
178                    structure_builder.init_model(current_model_id)
179                    current_model_id += 1
180                    model_open = 1
181                fullname = line[12:16]
182                # get rid of whitespace in atom names
183                split_list = fullname.split()
184                if len(split_list) != 1:
185                    # atom name has internal spaces, e.g. " N B ", so
186                    # we do not strip spaces
187                    name = fullname
188                else:
189                    # atom name is like " CA ", so we can strip spaces
190                    name = split_list[0]
191                altloc = line[16]
192                resname = line[17:20].strip()
193                chainid = line[21]
194                try:
195                    serial_number = int(line[6:11])
196                except Exception:
197                    serial_number = 0
198                resseq = int(line[22:26].split()[0])  # sequence identifier
199                icode = line[26]  # insertion code
200                if record_type == "HETATM":  # hetero atom flag
201                    if resname == "HOH" or resname == "WAT":
202                        hetero_flag = "W"
203                    else:
204                        hetero_flag = "H"
205                else:
206                    hetero_flag = " "
207                residue_id = (hetero_flag, resseq, icode)
208                # atomic coordinates
209                try:
210                    x = float(line[30:38])
211                    y = float(line[38:46])
212                    z = float(line[46:54])
213                except Exception:
214                    # Should we allow parsing to continue in permissive mode?
215                    # If so, what coordinates should we default to?  Easier to abort!
216                    raise PDBConstructionException(
217                        "Invalid or missing coordinate(s) at line %i."
218                        % global_line_counter
219                    ) from None
220                coord = numpy.array((x, y, z), "f")
221
222                # occupancy & B factor
223                if not self.is_pqr:
224                    try:
225                        occupancy = float(line[54:60])
226                    except Exception:
227                        self._handle_PDB_exception(
228                            "Invalid or missing occupancy", global_line_counter
229                        )
230                        occupancy = None  # Rather than arbitrary zero or one
231                    if occupancy is not None and occupancy < 0:
232                        # TODO - Should this be an error in strict mode?
233                        # self._handle_PDB_exception("Negative occupancy",
234                        #                            global_line_counter)
235                        # This uses fixed text so the warning occurs once only:
236                        warnings.warn(
237                            "Negative occupancy in one or more atoms",
238                            PDBConstructionWarning,
239                        )
240                    try:
241                        bfactor = float(line[60:66])
242                    except Exception:
243                        self._handle_PDB_exception(
244                            "Invalid or missing B factor", global_line_counter
245                        )
246                        bfactor = 0.0  # PDB uses a default of zero if missing
247
248                elif self.is_pqr:
249                    # Attempt to parse charge and radius fields
250                    try:
251                        pqr_charge = float(line[54:62])
252                    except Exception:
253                        self._handle_PDB_exception(
254                            "Invalid or missing charge", global_line_counter
255                        )
256                        pqr_charge = None  # Rather than arbitrary zero or one
257                    try:
258                        radius = float(line[62:70])
259                    except Exception:
260                        self._handle_PDB_exception(
261                            "Invalid or missing radius", global_line_counter
262                        )
263                        radius = None
264                    if radius is not None and radius < 0:
265                        # In permissive mode raise fatal exception.
266                        message = "Negative atom radius"
267                        self._handle_PDB_exception(message, global_line_counter)
268                        radius = None
269
270                segid = line[72:76]
271                element = line[76:78].strip().upper()
272                if current_segid != segid:
273                    current_segid = segid
274                    structure_builder.init_seg(current_segid)
275                if current_chain_id != chainid:
276                    current_chain_id = chainid
277                    structure_builder.init_chain(current_chain_id)
278                    current_residue_id = residue_id
279                    current_resname = resname
280                    try:
281                        structure_builder.init_residue(
282                            resname, hetero_flag, resseq, icode
283                        )
284                    except PDBConstructionException as message:
285                        self._handle_PDB_exception(message, global_line_counter)
286                elif current_residue_id != residue_id or current_resname != resname:
287                    current_residue_id = residue_id
288                    current_resname = resname
289                    try:
290                        structure_builder.init_residue(
291                            resname, hetero_flag, resseq, icode
292                        )
293                    except PDBConstructionException as message:
294                        self._handle_PDB_exception(message, global_line_counter)
295
296                if not self.is_pqr:
297                    # init atom with pdb fields
298                    try:
299                        structure_builder.init_atom(
300                            name,
301                            coord,
302                            bfactor,
303                            occupancy,
304                            altloc,
305                            fullname,
306                            serial_number,
307                            element,
308                        )
309                    except PDBConstructionException as message:
310                        self._handle_PDB_exception(message, global_line_counter)
311                elif self.is_pqr:
312                    try:
313                        structure_builder.init_atom(
314                            name,
315                            coord,
316                            pqr_charge,
317                            radius,
318                            altloc,
319                            fullname,
320                            serial_number,
321                            element,
322                            pqr_charge,
323                            radius,
324                            self.is_pqr,
325                        )
326                    except PDBConstructionException as message:
327                        self._handle_PDB_exception(message, global_line_counter)
328            elif record_type == "ANISOU":
329                anisou = [
330                    float(x)
331                    for x in (
332                        line[28:35],
333                        line[35:42],
334                        line[43:49],
335                        line[49:56],
336                        line[56:63],
337                        line[63:70],
338                    )
339                ]
340                # U's are scaled by 10^4
341                anisou_array = (numpy.array(anisou, "f") / 10000.0).astype("f")
342                structure_builder.set_anisou(anisou_array)
343            elif record_type == "MODEL ":
344                try:
345                    serial_num = int(line[10:14])
346                except Exception:
347                    self._handle_PDB_exception(
348                        "Invalid or missing model serial number", global_line_counter
349                    )
350                    serial_num = 0
351                structure_builder.init_model(current_model_id, serial_num)
352                current_model_id += 1
353                model_open = 1
354                current_chain_id = None
355                current_residue_id = None
356            elif record_type == "END   " or record_type == "CONECT":
357                # End of atomic data, return the trailer
358                self.line_counter += local_line_counter
359                return coords_trailer[local_line_counter:]
360            elif record_type == "ENDMDL":
361                model_open = 0
362                current_chain_id = None
363                current_residue_id = None
364            elif record_type == "SIGUIJ":
365                # standard deviation of anisotropic B factor
366                siguij = [
367                    float(x)
368                    for x in (
369                        line[28:35],
370                        line[35:42],
371                        line[42:49],
372                        line[49:56],
373                        line[56:63],
374                        line[63:70],
375                    )
376                ]
377                # U sigma's are scaled by 10^4
378                siguij_array = (numpy.array(siguij, "f") / 10000.0).astype("f")
379                structure_builder.set_siguij(siguij_array)
380            elif record_type == "SIGATM":
381                # standard deviation of atomic positions
382                sigatm = [
383                    float(x)
384                    for x in (
385                        line[30:38],
386                        line[38:46],
387                        line[46:54],
388                        line[54:60],
389                        line[60:66],
390                    )
391                ]
392                sigatm_array = numpy.array(sigatm, "f")
393                structure_builder.set_sigatm(sigatm_array)
394            elif record_type not in allowed_records:
395                warnings.warn(
396                    "Ignoring unrecognized record '{}' at line {}".format(
397                        record_type, global_line_counter
398                    ),
399                    PDBConstructionWarning,
400                )
401            local_line_counter += 1
402        # EOF (does not end in END or CONECT)
403        self.line_counter = self.line_counter + local_line_counter
404        return []
405
406    def _handle_PDB_exception(self, message, line_counter):
407        """Handle exception (PRIVATE).
408
409        This method catches an exception that occurs in the StructureBuilder
410        object (if PERMISSIVE), or raises it again, this time adding the
411        PDB line number to the error message.
412        """
413        message = "%s at line %i." % (message, line_counter)
414        if self.PERMISSIVE:
415            # just print a warning - some residues/atoms may be missing
416            warnings.warn(
417                "PDBConstructionException: %s\n"
418                "Exception ignored.\n"
419                "Some atoms or residues may be missing in the data structure."
420                % message,
421                PDBConstructionWarning,
422            )
423        else:
424            # exceptions are fatal - raise again with new message (including line nr)
425            raise PDBConstructionException(message) from None
426