from __future__ import print_function

import os
import re
import sys
import struct
import pymol
import math
from .xray import space_group_map

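# CCP4 machine-stamp codes (per the CCP4/MTZ convention, also used by
# authorStamp below): 1 = big-endian IEEE formats, 4 = little-endian IEEE formats.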
if sys.byteorder == "big":
    MACH_ARCH_CODE_INT = 1
    MACH_ARCH_CODE_FLT = 1
else:
    MACH_ARCH_CODE_INT = 4
    MACH_ARCH_CODE_FLT = 4

class baseHeader:
    """
    A simple base class used to parse biological data headers
    """
    def __init__(self,filename):
        self.filename  = filename
        self.cols      = []
        self.byteorder_int = None
        self.byteorder_flt = None
        self.wordsize  = None

    def parseFile(self):
        pass
    def getColumns(self):
        return self.cols
    def getColumnsOfType(self,targetType):
        pass
    def checkFile(self):
        return os.path.isfile(self.filename)

    def guessCols(self,mapType):
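        """
        Guess the amplitude (F) and phase (P) column pair for the requested
        map type ("2fofc" or "fofc") using the program-specific default
        column name settings.  Returns (FCol, PCol, program) on success,
        otherwise [None, None, None].
        """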
        fC = self.getColumnsOfType("F")
        pC = self.getColumnsOfType("P")

        looksLike, FCol, PCol = [None]*3

        for prg in ("refmac", "phenix", "phenix_no_fill", "buster"):
            #
            # default (refmac,phenix,phenix_no_fill,buster) names are in the format:
            #  "2FoFc-amplitude 2FoFc-phase FoFc-ampl FoFc-phase"
            #
            defaultNames = pymol.cmd.get_setting_text("default_%s_names" % (prg))
            defaultNames = re.split(r"[,\s]+", defaultNames.strip()) if defaultNames else []

            if len(defaultNames) < 4:
                print("Error: Please set 'default_%s_names' to a comma-separated string" % (prg))
                print("       with the 2FoFc and FoFc amplitude and phase column names, respectively.")
                return [None]*3

            if mapType.lower() == "2fofc":
                defaultF, defaultP = defaultNames[0], defaultNames[1]
            elif mapType.lower() == "fofc":
                defaultF, defaultP = defaultNames[2], defaultNames[3]
            else:
                # unknown map type; nothing to guess
                return [None]*3

            # defaultF =~ "FWT", "2FOFCFWT", "2FOFCWT_no_fill", "2FOFCWT", "DELFWT", "FOFCWT", ..
            for curFCol in fC:
                if curFCol.endswith(defaultF):
                    # found F; strip the suffix to get the prefix, then look for the matching P
                    pfx = curFCol[:-len(defaultF)]
                    curPCol = pfx+defaultP

                    ## print "curFCol = %s" % curFCol
                    ## print "curPCol = %s" % curPCol

                    if curPCol in pC:
                        # found perfectly matching columns
                        looksLike = prg
                        FCol = curFCol
                        PCol = curPCol
                        return (FCol, PCol, looksLike)
        return [None]*3


class CNSHeader(baseHeader):
    """
    CNS format
    """
    def __init__(self,filename):
        ## print "CNSHeader cstr"
        baseHeader.__init__(self,filename)
        self.parseFile()

class CIFHeader(baseHeader):
    """
    mmCIF
    """
    def __init__(self,filename):
        baseHeader.__init__(self,filename)
        self.parseFile()

    def parseFile(self):
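        """
        Collect the _refln.* column labels from the loop_ blocks of the
        mmCIF reflection file into self.cols.
        """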
        if self.checkFile():
            try:
                inFile = open(self.filename,'r')
                data = inFile.readlines()
                inFile.close()

                in_loop = False

                for curLine in data:
                    if in_loop:
                        if curLine.startswith("_refln."):
                            self.cols.append(curLine.split(".",1)[1].strip())
                        else:
                            in_loop = False
                    else:
                        if curLine.startswith("loop_"):
                            in_loop = True

            except IOError:
                print("Error-CIFReader: Couldn't read '%s' for input." % (self.filename))

class MTZHeader(baseHeader):
    """
    MTZ/CCP4
    """
    HEADER_KEYWORDS = {
        "VERS"    : "VERS",
        "TITLE"   : "TITLE",
        "NCOL"    : "NCOL",
        "CELL"    : "CELL",
        "SORT"    : "SORT",
        "SYMINF"  : "SYMINF",
        "SYMM"    : "SYMM",
        "RESO"    : "RESO",
        "VALM"    : "VALM",
        "NDIF"    : "NDIF",
        "COL"     : "COL",
        "PROJECT" : "PROJECT",
        "CRYSTAL" : "CRYSTAL",
        "DATASET" : "DATASET",
        "DCELL"   : "DCELL",
        "DWAVEL"  : "DWAVEL",
        "BATCH"   : "BATCH",
        }

    def __init__(self,filename):
        baseHeader.__init__(self,filename)

        self.version   = None
        self.title     = None
        self.ncol      = None
        self.nrefl     = None
        self.nbatches  = None
        self.cell      = None
        self.sort      = None
        self.syminf    = None
        self.symm      = []
        self.reso_min  = None
        self.reso_max  = None
        self.valm      = None
        self.ndif      = None
        self.col       = None
        self.project   = None
        self.crystal   = None
        self.dataset   = None
        self.dcell     = None
        self.dwavel    = None
        self.batch     = None

        self.datasets  = {}

        self.parseFile()
        self.format_cols()

    def format_cols(self,colType=None):
        """
        updates self.cols to the list of columns, optionally restricted
        to a given MTZ column type
        """
        self.cols = []

        # UNIQUE:
        # crystal_name/dataset_name/col_label
        for key in self.datasets:
            c = []
            ## print "KEY = %s" % key
            # user wants all columns
            if colType is None:
                c = list(self.datasets[key]["cols"].keys())
            else:
                # user wants a specific column type
                for tmpCol in self.datasets[key]["cols"].keys():
                    if self.datasets[key]["cols"][tmpCol]["type"]==colType:
                        c.append(tmpCol)
            ## print "Columns of type %s are: " % colType
            ## print c
            ## print "Now formatting...."
            if "crystal" in self.datasets[key]:
                curCryst = [ self.datasets[key]["crystal"] ] * len(c)
            else:
                # one placeholder crystal name per column
                curCryst = [ "nameless_crystal" ] * len(c)
            ## print "CurCrystal is: %s." % curCryst
            datName  = [ self.datasets[key]["name"] ] * len(c)
            self.cols.extend(
                ["/".join(x) for x in zip(curCryst,datName,c)])

    def getColumns(self):
        self.format_cols()
        return baseHeader.getColumns(self)

    def getColumnsOfType(self,colType):
        self.format_cols(colType)
        return baseHeader.getColumns(self)

    def get_byteorder(self):
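        """
        Return the (integer, floating-point) machine-stamp codes read from
        the file (1 = big endian, 4 = little endian).
        """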
        with open(self.filename, 'rb') as f:
            f.seek(8)
            d = struct.unpack("BBBB", f.read(4))
        return (d[1]>>4) & 0x0f, (d[0]>>4) & 0x0f

    def authorStamp(self,f):
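        """
        Read the CCP4 machine stamp from the open file handle and record
        the byte order used to write the file in self.byteorder_flt,
        self.byteorder_int and self.wordsize.
        """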
        # seek the machine stamp
        f.seek(8,0)
        # read machine stamp
        (B0,B1,B2,B3) = struct.unpack("BBBB", f.read(4))

        # determines big or little endian for each type:
        # double, float, int, char
        d   = (B0 & 0xF0) >> 4
        flt = (B0 & 0x0F)
        i   = (B1 & 0xF0) >> 4
        c   = (B1 & 0x0F)

        if d==1:
            # big endian
            self.byteorder_flt = ">"
        elif d==4:
            # little endian
            self.byteorder_flt = "<"

        if i==1:
            # big endian
            self.byteorder_int = ">"
            self.wordsize = 1
        elif i==4:
            # little endian
            self.byteorder_int = "<"
            self.wordsize = 4

    def parseFile(self):
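        """
        Parse the MTZ header.  Bytes 5-8 of the file give the header start
        as a 1-based offset in words, and the header itself is a series of
        80-character keyword records terminated by an "END" record.
        """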
        import shlex

        if self.checkFile():
            f = open(self.filename,'rb')

            # sets authoring machine's details
            self.authorStamp(f)

            # get file size
            f.seek(0,2)
            file_len = f.tell()
            # get header offset
            f.seek(4)
            if self.wordsize is not None:
                (header_start,) = struct.unpack(self.byteorder_int+"i", f.read(4))
            else:
                print("Warning: Byte order of file unknown.  Guessing header location.")
                (header_start,) = struct.unpack("i", f.read(4))

            # bAdjust is the byte adjustment to compensate for
            # older machines with smaller atomic sizes
            host_word_size = len(struct.pack('i',0))
            author_word_size = self.wordsize if self.wordsize is not None else host_word_size
            if host_word_size<author_word_size:
                bAdjust = author_word_size // host_word_size
            elif host_word_size>author_word_size:
                bAdjust = author_word_size * host_word_size
            else:
                bAdjust = host_word_size

            header_start = (header_start-1) * bAdjust

            if file_len<header_start:
                print("Error: File '%s' cannot be parsed because PyMOL cannot find the header.  If you think" % (self.filename))
                print("       PyMOL should be able to read this, please send the file and this message to")
                print("       help@schrodinger.com.  Thanks!")
                return

            f.seek(header_start)

            curLine = struct.unpack("80s", f.read(80))[0]
            curLine = curLine.decode()

            while not (curLine.startswith("END")):
                # yank field identifier
                (field, tokens) = curLine.split(" ",1)

                H = MTZHeader.HEADER_KEYWORDS
                try:
                    if field.startswith(H["VERS"]):
                        self.version = tokens
                    elif field.startswith(H["TITLE"]):
                        self.title = tokens
                    elif field.startswith(H["NCOL"]):
                        (self.ncols, self.nrefl, self.nbatches) = tokens.split()
                    elif field.startswith(H["CELL"]):
                        tokens = tokens.split()
                        self.cell_dim    = tokens[:3]
                        self.cell_angles = tokens[3:]
                    elif field.startswith(H["SORT"]):
                        self.sort = tokens.split()
                    elif field.startswith(H["SYMINF"]):
                        tokens = shlex.split(tokens)
                        self.nsymmop  = tokens[0]
                        self.nprimop  = tokens[1]
                        self.lattice  = tokens[2]
                        self.groupnum = tokens[3]
                        self.space_group = tokens[4]
                        self.pt_group    = tokens[5]

                        # valid format for MTZ space group?
                        self.space_group = space_group_map.get(self.space_group, self.space_group)

                    elif field.startswith(H["SYMM"]):
                        tokens = tokens.split()
                        tokens = [x.strip(",").strip() for x in tokens]
                        self.symm.append(tokens)
                    elif field.startswith(H["RESO"]):
                        self.reso_min, self.reso_max = tokens.split()
                        self.reso_min = math.sqrt(1./float(self.reso_min))
                        self.reso_max = math.sqrt(1./float(self.reso_max))
                    elif field.startswith(H["VALM"]):
                        self.missing_flag = tokens
                    elif field.startswith(H["COL"]):
                        if field == "COLSRC":
                            raise pymol.cmd.QuietException
                        (lab,typ,m,M,i) = tokens.split()
                        if i not in self.datasets:
                            self.datasets[i] = {}
                            self.datasets[i]["cols"]  = {}

                        self.datasets[i]["cols"][lab] = {}
                        self.datasets[i]["cols"][lab]["type"] = typ
                        self.datasets[i]["cols"][lab]["min"]  = m
                        self.datasets[i]["cols"][lab]["max"]  = M
                        self.datasets[i]["cols"][lab]["id"]   = i
                    elif field.startswith(H["NDIF"]):
                        self.ndif = int(tokens)
                    elif field.startswith(H["PROJECT"]):
                        (i,proj) = tokens.split(None, 1)
                        if i not in self.datasets:
                            self.datasets[i] = {}
                        self.datasets[i]["project"] = proj.strip()
                    elif field.startswith(H["CRYSTAL"]):
                        # can have multiple crystals per dataset?
                        # if so, not supported (overwritten) here.
                        (i,cryst) = tokens.split(None, 1)
                        if i not in self.datasets:
                            self.datasets[i] = {}
                        self.datasets[i]["crystal"] = cryst.strip()
                    elif field.startswith(H["DATASET"]):
                        (i,d) = tokens.split(None, 1)
                        if i not in self.datasets:
                            self.datasets[i] = {}
                        self.datasets[i]["name"] = d.strip()
                    elif field.startswith(H["DCELL"]):
                        (i,x,y,z,a,b,g) = tokens.split()
                        if i not in self.datasets:
                            self.datasets[i] = {}
                        self.datasets[i]['x'] = x
                        self.datasets[i]['y'] = y
                        self.datasets[i]['z'] = z
                        self.datasets[i]['alpha'] = a
                        self.datasets[i]['beta'] = b
                        self.datasets[i]['gamma'] = g
                    elif field.startswith(H["DWAVEL"]):
                        (i,wl) = tokens.split()
                        if i not in self.datasets:
                            self.datasets[i] = {}
                        self.datasets[i]["wavelength"] = wl
                    elif field.startswith(H["BATCH"]):
                        self.batch = tokens
                    else:
                        print("Error Parsing MTZ Header: unrecognized header record: '%s'" % field)
                except ValueError:
                    print("Error: Parsing MTZ header: poorly formatted MTZ file")
                    print("       bad field: '%s'" % field)
                except pymol.cmd.QuietException:
                    pass

                curLine = struct.unpack("80s", f.read(80))[0]
                curLine = curLine.decode()


if __name__=="__main__":

    c = CIFHeader("test.cif")
    print(c.getColumns())

    m = MTZHeader("test.mtz")
    print(m.getColumns())

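    # Minimal sketch of column guessing (assumes the default_*_names settings
    # are configured in PyMOL); prints the guessed 2FoFc amplitude/phase
    # columns and the program they appear to come from.
    print(m.guessCols("2FoFc"))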