1# File download/unzip written 2012 by Lenna X. Peterson (arklenna@gmail.com)
2# Dictionary extraction written 2011 by Hongbo Zhu
3#
4# This code is part of the Biopython distribution and governed by its
5# license.  Please see the LICENSE file that should have been included
6# as part of this package.
7
8"""Download PDB Chemical Component Dictionary and generate dict.
9
10Download and parse PDB Chemical Component Dictionary,
11then write out dict for to_one_letter_code.
12"""
13
14
15import gzip
16import inspect
17import os
18import warnings
19
20from urllib.request import urlopen
21
url = "ftp://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"

# extract name of gzip file
gzname = os.path.basename(url)
# extract name of cif file (split by sep, remove last, rejoin)
cifname = os.extsep.join(gzname.split(os.extsep)[:-1])

# Stream the archive into the current working directory.  Both handles are
# managed by the same ``with`` statement so the remote connection and the
# local file are closed even if a read or write fails part-way through.
with urlopen(url) as url_handle, open(gzname, "wb") as gzh:
    print("Downloading file... (approx. 29 MB)")
    # Read in 64 KiB chunks; ``read`` returns b"" at end of stream, which is
    # the sentinel that stops the iteration.
    for data in iter(lambda: url_handle.read(65536), b""):
        gzh.write(data)

# Sanity check against the size as of 13 April 2012.  A smaller file only
# triggers a warning; the script carries on with whatever was downloaded.
if os.path.getsize(gzname) < 29944258:
    warnings.warn("ERROR: Downloaded file is too small", RuntimeWarning)
42
# The following code written by Hongbo Zhu
# generate three_to_one_dict
# two records in PDB Chemical Component Dictionary are parsed to
# generate the dictionary:
# _chem_comp.one_letter_code
# _chem_comp.three_letter_code


def _read_code_pairs(lines):
    """Yield ``(three_letter, one_letter)`` tuples parsed from CIF text lines.

    A pair is emitted as soon as both a ``_chem_comp.one_letter_code`` and a
    ``_chem_comp.three_letter_code`` record have been seen since the previous
    pair; the value of each record is its last whitespace-separated token.
    Three-letter codes are left-justified and space-padded to three characters.
    """
    one = three = None
    found_one = found_three = False
    for line in lines:
        if line.startswith("_chem_comp.one_letter_code"):
            one = line.split()[-1]
            found_one = True
        elif line.startswith("_chem_comp.three_letter_code"):
            three = f"{line.split()[-1]:<3}"  # make it three-letter
            found_three = True

        if found_one and found_three:
            yield three, one
            found_one = found_three = False


def _format_dict_lines(pairs, per_row=5):
    """Return the source lines of a ``to_one_letter_code`` dict literal.

    The first line opens the dict, every following line holds up to
    ``per_row`` ``'XXX':'Y'`` entries, and the last entry line carries the
    closing brace (no trailing comma, no trailing newline).  With no pairs
    the result is a valid empty dict.
    """
    lines = ["to_one_letter_code = {\n"]
    entries = [f"'{three}':'{one}'" for three, one in pairs]
    if not entries:
        lines.append("}")
        return lines

    rows = [entries[i : i + per_row] for i in range(0, len(entries), per_row)]
    for row in rows[:-1]:
        lines.append("    " + ",".join(row) + ",\n")
    lines.append("    " + ",".join(rows[-1]) + " }")
    return lines


# Parse straight from the gzip stream; extracting the archive to ``cifname``
# on disk is not necessary.  The stream must be opened in *text* mode: the
# record prefixes above are ``str``, and comparing them against the ``bytes``
# lines produced by binary mode raises TypeError.
with gzip.open(gzname, "rt", encoding="utf-8", errors="replace") as fh:
    _code_pairs = list(_read_code_pairs(fh))

# all three-letter codes
three_to_one_buf = _format_dict_lines(_code_pairs)
# only those with non-'?' one-letter codes
three_to_one_buf_noq = _format_dict_lines(
    [pair for pair in _code_pairs if pair[1] != "?"]
)
111
# Directory that contains this script, resolved to an absolute path
_scriptPath = os.path.abspath(
    os.path.dirname(inspect.getfile(inspect.currentframe()))
)
# Bio/SCOP lives two directory levels above the script's directory
_rafPath = os.path.normpath(
    os.path.join(_scriptPath, "..", "..", "Bio", "SCOP")
)
# Output modules: the complete table and the table without '?' entries
_threeAllPath = os.path.join(_rafPath, "three_to_one_all.py")
_threePath = os.path.join(_rafPath, "three_to_one_dict.py")

# Only the table without '?' entries is written.  The complete table
# (three_to_one_buf) could be written to _threeAllPath in the same way.
with open(_threePath, "w") as fh:
    fh.write("".join(three_to_one_buf_noq))
123