# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Turn an mmCIF file into a dictionary."""


from Bio.File import as_handle


class MMCIF2Dict(dict):
    """Parse an mmCIF file and return a dictionary."""

    def __init__(self, filename):
        """Parse an mmCIF file and return a dictionary.

        Arguments:
         - filename - name of the mmCIF file OR an open filehandle

        """
        self.quote_chars = ["'", '"']
        self.whitespace_chars = [" ", "\t"]
        with as_handle(filename) as handle:
            loop_flag = False
            key = None
            tokens = self._tokenize(handle)
            try:
                token = next(tokens)
            except StopIteration:
                return  # for Python 3.7 and PEP 479
            # The first token is the data block heading, e.g. "data_1ABC";
            # store the block name under the "data_" key.
            self[token[0:5]] = token[5:]
            i = 0
            n = 0
            for token in tokens:
                if token.lower() == "loop_":
                    loop_flag = True
                    keys = []
                    i = 0
                    n = 0
                    continue
                elif loop_flag:
                    # The second condition checks we are in the first column.
                    # Some mmCIF files (e.g. 4q9r) have values in later columns
                    # starting with an underscore and we don't want to read
                    # these as keys.
                    if token.startswith("_") and (n == 0 or i % n == 0):
                        if i > 0:
                            loop_flag = False
                        else:
                            self[token] = []
                            keys.append(token)
                            n += 1
                            continue
                    else:
                        self[keys[i % n]].append(token)
                        i += 1
                        continue
                if key is None:
                    key = token
                else:
                    self[key] = [token]
                    key = None

    # Private methods

    def _splitline(self, line):
        # See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
        # for the syntax.
        in_token = False
        # Quote character of the currently open quote, or None if no quote open
        quote_open_char = None
        start_i = 0
        for (i, c) in enumerate(line):
            if c in self.whitespace_chars:
                if in_token and not quote_open_char:
                    in_token = False
                    yield line[start_i:i]
            elif c in self.quote_chars:
                if not quote_open_char and not in_token:
                    quote_open_char = c
                    in_token = True
                    start_i = i + 1
                elif c == quote_open_char and (
                    i + 1 == len(line) or line[i + 1] in self.whitespace_chars
                ):
                    quote_open_char = None
                    in_token = False
                    yield line[start_i:i]
            elif c == "#" and not in_token:
                # Skip comments. "#" is a valid non-comment char inside of a
                # quote and inside of an unquoted token, so we need to check
                # that the current char is not in a token.
                return
            elif not in_token:
                in_token = True
                start_i = i
        if in_token:
            yield line[start_i:]
        if quote_open_char:
            raise ValueError("Line ended with quote open: " + line)

    def _tokenize(self, handle):
        empty = True
        for line in handle:
            empty = False
            if line.startswith("#"):
                continue
            elif line.startswith(";"):
                # The spec says that leading whitespace on each line must be
                # preserved while trailing whitespace may be stripped. The
                # trailing newline must be stripped.
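                # For example, a semicolon-delimited text field such as:
                #     ;A long value
                #     split over two lines
                #     ;
                # is collected here and yielded as the single token
                # "A long value\nsplit over two lines".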
                token_buffer = [line[1:].rstrip()]
                for line in handle:
                    line = line.rstrip()
                    if line.startswith(";"):
                        yield "\n".join(token_buffer)
                        line = line[1:]
                        if line and line[0] not in self.whitespace_chars:
                            raise ValueError("Missing whitespace")
                        break
                    token_buffer.append(line)
                else:
                    raise ValueError("Missing closing semicolon")
            yield from self._splitline(line.strip())
        if empty:
            raise ValueError("Empty file.")
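
# A minimal usage sketch. The import path assumes this module lives at
# Bio.PDB.MMCIF2Dict, as in the Biopython distribution; "1fat.cif" is a
# placeholder filename.
#
#     from Bio.PDB.MMCIF2Dict import MMCIF2Dict
#
#     mmcif_dict = MMCIF2Dict("1fat.cif")
#     mmcif_dict["_entry.id"]           # every value is a list, e.g. ["1FAT"]
#     mmcif_dict["_atom_site.Cartn_x"]  # one string per row of the loop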