1# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
2#
3# This file is part of the Biopython distribution and governed by your
4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5# Please see the LICENSE file that should have been included as part of this
6# package.
7
8"""Turn an mmCIF file into a dictionary."""
9
10
11from Bio.File import as_handle
12
13
class MMCIF2Dict(dict):
    """Parse a mmCIF file and return a dictionary.

    The dictionary maps mmCIF data names to their values.  Values from a
    ``loop_`` category are stored as a list of strings (one entry per row);
    a plain key/value pair is stored as a single-item list.  The leading
    ``data_`` heading is the one exception: it is stored under the key
    ``"data_"`` with the block identifier as a plain string.
    """

    def __init__(self, filename):
        """Parse a mmCIF file and return a dictionary.

        Arguments:
         - filename - name of the mmCIF file OR an open filehandle

        """
        self.quote_chars = ["'", '"']
        self.whitespace_chars = [" ", "\t"]
        with as_handle(filename) as handle:
            loop_flag = False
            key = None
            tokens = self._tokenize(handle)
            try:
                token = next(tokens)
            except StopIteration:
                return  # for Python 3.7 and PEP 479
            # The first token is expected to be the block heading,
            # e.g. "data_1ABC"; slicing at 5 (== len("data_")) stores it
            # as self["data_"] = "1ABC".
            self[token[0:5]] = token[5:]
            i = 0  # number of loop_ values consumed so far
            n = 0  # number of loop_ keys (i.e. columns per row)
            for token in tokens:
                if token.lower() == "loop_":
                    # Start of a loop_ section: reset the per-loop state.
                    loop_flag = True
                    keys = []
                    i = 0
                    n = 0
                    continue
                elif loop_flag:
                    # The second condition checks we are in the first column
                    # Some mmCIF files (e.g. 4q9r) have values in later columns
                    # starting with an underscore and we don't want to read
                    # these as keys
                    if token.startswith("_") and (n == 0 or i % n == 0):
                        if i > 0:
                            # A key at the start of a row, after at least one
                            # value row: the loop_ has ended.  Note we fall
                            # through so this token is handled as an ordinary
                            # key below.
                            loop_flag = False
                        else:
                            # Still in the loop_ header: register a new column.
                            self[token] = []
                            keys.append(token)
                            n += 1
                            continue
                    else:
                        # A value: append it to its column (row-major order,
                        # so column index is i modulo the number of columns).
                        self[keys[i % n]].append(token)
                        i += 1
                        continue
                if key is None:
                    # Outside a loop_: this token is a data name...
                    key = token
                else:
                    # ...and this one is its value, stored as a
                    # single-item list.
                    self[key] = [token]
                    key = None

    # Private methods

    def _splitline(self, line):
        """Split one line into tokens, honouring CIF quoting rules (PRIVATE).

        Yields each token as a string (quotes stripped from quoted tokens).
        Raises ValueError if the line ends with an unclosed quote.
        """
        # See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for the syntax
        in_token = False
        # quote character of the currently open quote, or None if no quote open
        quote_open_char = None
        start_i = 0
        for (i, c) in enumerate(line):
            if c in self.whitespace_chars:
                # Whitespace ends the current token, unless we are inside
                # a quoted token.
                if in_token and not quote_open_char:
                    in_token = False
                    yield line[start_i:i]
            elif c in self.quote_chars:
                if not quote_open_char and not in_token:
                    # Opening quote: the token starts after the quote char.
                    quote_open_char = c
                    in_token = True
                    start_i = i + 1
                elif c == quote_open_char and (
                    i + 1 == len(line) or line[i + 1] in self.whitespace_chars
                ):
                    # A matching quote only closes the token when followed by
                    # whitespace or end of line; otherwise (and for any quote
                    # char inside an unquoted token) it is part of the token.
                    quote_open_char = None
                    in_token = False
                    yield line[start_i:i]
            elif c == "#" and not in_token:
                # Skip comments. "#" is a valid non-comment char inside of a
                # quote and inside of an unquoted token (!?!?), so we need to
                # check that the current char is not in a token.
                return
            elif not in_token:
                # Any other character starts a new unquoted token.
                in_token = True
                start_i = i
        if in_token:
            # Line ended while inside a token: emit the final token.
            yield line[start_i:]
        if quote_open_char:
            raise ValueError("Line ended with quote open: " + line)

    def _tokenize(self, handle):
        """Yield successive tokens read from the file handle (PRIVATE).

        Handles full-line comments and semicolon-delimited multi-line
        values here; everything else is delegated to _splitline.
        Raises ValueError on an empty file or an unterminated
        multi-line value.
        """
        empty = True
        for line in handle:
            empty = False
            if line.startswith("#"):
                # Full-line comment (inline "#" comments are handled
                # by _splitline).
                continue
            elif line.startswith(";"):
                # Start of a multi-line value, terminated by the next line
                # beginning with ";".
                # The spec says that leading whitespace on each line must be
                # preserved while trailing whitespace may be stripped.  The
                # trailing newline must be stripped.
                token_buffer = [line[1:].rstrip()]
                for line in handle:
                    line = line.rstrip()
                    if line.startswith(";"):
                        yield "\n".join(token_buffer)
                        line = line[1:]
                        if line and not line[0] in self.whitespace_chars:
                            raise ValueError("Missing whitespace")
                        break
                    token_buffer.append(line)
                else:
                    # End of file reached while inside a multi-line value.
                    raise ValueError("Missing closing semicolon")
            # Tokenize the (rest of the) line; after a multi-line value this
            # is whatever followed the closing ";".
            yield from self._splitline(line.strip())
        if empty:
            raise ValueError("Empty file.")
129