1from __future__ import print_function 2 3import os 4import re 5import sys 6import struct 7import pymol 8import math 9from .xray import space_group_map 10 11 12if sys.byteorder == "big": 13 MACH_ARCH_CODE_INT = 1 14 MACH_ARCH_CODE_FLT = 1 15else: 16 MACH_ARCH_CODE_INT = 4 17 MACH_ARCH_CODE_FLT = 4 18 19class baseHeader: 20 """ 21 A simple base class used to parse biological data headers 22 """ 23 def __init__(self,filename): 24 self.filename = filename 25 self.cols = [] 26 self.byteorder_int = None 27 self.byteorder_flt = None 28 self.wordsize = None 29 30 def parseFile(self): 31 pass 32 def getColumns(self): 33 return self.cols 34 def getColumnsOfType(self,targetType): 35 pass 36 def checkFile(self): 37 return os.path.isfile(self.filename) 38 39 def guessCols(self,mapType): 40 fC = self.getColumnsOfType("F") 41 pC = self.getColumnsOfType("P") 42 43 looksLike, FCol, PCol = [None]*3 44 45 for prg in ("refmac", "phenix", "phenix_no_fill", "buster"): 46 # 47 # default (refmac,phenix,phenix_no_fill,buster) names are in the format: 48 # "2FoFc-amplitude 2FoFc-phase FoFc-ampl FoFc-phase" 49 # 50 defaultNames = pymol.cmd.get_setting_text("default_%s_names" % (prg)) 51 52 if len(defaultNames): 53 defaultNames = defaultNames.split() 54 if len(defaultNames): 55 if mapType.lower() == "2fofc": 56 defaultF, defaultP = defaultNames[0], defaultNames[1] 57 elif mapType.lower() == "fofc": 58 defaultF, defaultP = defaultNames[2], defaultNames[3] 59 else: 60 print("Error: Please provide the setting 'default_%s_names' a comma separated string" % (prg)) 61 print(" with the values for 2FoFc and FoFc, amplitude and phase names, respectively.") 62 return [None]*3 63 else: 64 print("Error: Please provide the setting 'default_%s_names' a comma separated string" % (prg)) 65 print(" with the values for 2FoFc and FoFc, amplitude and phase names, respectively.") 66 return [None]*3 67 68 69 # defaultF =~ "FWT", "2FOFCFWT", "2FOFCWT_no_fill", "2FOFCWT", "DELFWT", "FOFCWT", .. 70 for curFCol in fC: 71 if curFCol.endswith(defaultF): 72 # found F, now look for matching P 73 pfx = curFCol.rstrip(defaultF) 74 curPCol = pfx+defaultP 75 76 ## print "curFCol = %s" % curFCol 77 ## print "curPCol = %s" % curPCol 78 79 if curPCol in pC: 80 # found perfectly matching columns 81 looksLike = prg 82 FCol = curFCol 83 PCol = curPCol 84 return (FCol, PCol, looksLike) 85 return [None]*3 86 87 88 89class CNSHeader(baseHeader): 90 """ 91 CNS format 92 """ 93 def __init__(self,filename): 94 ## print "CNSHeader cstr" 95 baseHeader.__init__(self,filename) 96 self.parseFile() 97 98class CIFHeader(baseHeader): 99 """ 100 mmCIF 101 """ 102 def __init__(self,filename): 103 baseHeader.__init__(self,filename) 104 self.parseFile() 105 106 def parseFile(self): 107 if self.checkFile(): 108 try: 109 inFile = open(self.filename,'rb') 110 data = inFile.readlines() 111 inFile.close() 112 113 in_loop = False 114 115 curLine = data.pop(0) 116 while curLine is not None: 117 if in_loop: 118 if curLine.startswith("_refln."): 119 self.cols.append(curLine.split(".",1)[1].strip()) 120 else: 121 in_loop=False 122 else: 123 if curLine.startswith("loop_"): 124 in_loop=True 125 if len(data): 126 curLine = data.pop(0) 127 else: 128 curLine = None 129 130 131 except IOError as e: 132 print("Error-CIFReader: Couldn't read '%s' for input." % (self.filename)) 133 134class MTZHeader(baseHeader): 135 HEADER_KEYWORDS = { 136 "VERS" : "VERS", 137 "TITLE" : "TITLE", 138 "NCOL" : "NCOL", 139 "CELL" : "CELL", 140 "SORT" : "SORT", 141 "SYMINF" : "SYMINF", 142 "SYMM" : "SYMM", 143 "RESO" : "RESO", 144 "VALM" : "VALM", 145 "NDIF" : "NDIF", 146 "COL" : "COL", 147 "PROJECT" : "PROJECT", 148 "CRYSTAL" : "CRYSTAL", 149 "DATASET" : "DATASET", 150 "DCELL" : "DCELL", 151 "DWAVEL" : "DWAVEL", 152 "BATCH" : "BATCH", 153 } 154 155 """ 156 MTZ/CCP4 157 """ 158 def __init__(self,filename): 159 baseHeader.__init__(self,filename) 160 161 self.version = None 162 self.title = None 163 self.ncol = None 164 self.nrefl = None 165 self.nbatches = None 166 self.cell = None 167 self.sort = None 168 self.syminf = None 169 self.symm = [] 170 self.reso_min = None 171 self.reso_max = None 172 self.valm = None 173 self.ndif = None 174 self.col = None 175 self.project = None 176 self.crystal = None 177 self.dataset = None 178 self.dcell = None 179 self.dwavel = None 180 self.batch = None 181 182 self.datasets = {} 183 184 self.parseFile() 185 self.format_cols() 186 187 def format_cols(self,colType=None): 188 """ 189 updates self.cols to a list of cols of a 190 given MTZ type 191 """ 192 self.cols = [] 193 194 # UNIQUE: 195 # crystal_name/dataset_name/col_label" 196 s = "/" 197 c = [] 198 for key in self.datasets: 199 c=[] 200 ## print "KEY = %s" % key 201 # user wants all columns 202 if colType is None: 203 c = list(self.datasets[key]["cols"].keys()) 204 else: 205 # user wants a specfic column 206 for tmpCol in self.datasets[key]["cols"].keys(): 207 if self.datasets[key]["cols"][tmpCol]["type"]==colType: 208 c.append(tmpCol) 209 ## print "Columns of type %s are: " % colType 210 ## print c 211 ## print "Now formatting...." 212 if "crystal" in self.datasets[key]: 213 curCryst = [ self.datasets[key]["crystal"] ] * len(c) 214 else: 215 curCryst = "nameless_crystal" * len(c) 216 ## print "CurCrystal is: %s." % curCryst 217 datName = [ self.datasets[key]["name"] ] * len(c) 218 self.cols.extend( 219 ["/".join(x) for x in zip(curCryst,datName,c)]) 220 221 def getColumns(self): 222 self.format_cols() 223 return baseHeader.getColumns(self) 224 225 def getColumnsOfType(self,colType): 226 self.format_cols(colType) 227 return baseHeader.getColumns(self) 228 229 def get_byteorder(self): 230 f = open(self.filename, 'rb') 231 f.seek(8) 232 d = struct.unpack("BBBB",f.read(4)) 233 f.close() 234 return (d[1]>>4) & 0x0f, (d[0]>>4) & 0x0f 235 236 def authorStamp(self,f): 237 # seek the machine stamp 238 f.seek(8,0) 239 # read machine stamp 240 (B0,B1,B2,B3) = struct.unpack("BBBB", f.read(4)) 241 242 # determines big or little endian: 243 # double, float, int, char 244 d = (B0 & 0xF0) >> 4 245 f = (B0 & 0x0F) 246 i = (B1 & 0xF0) >> 4 247 c = (B1 & 0x0F) 248 249 if d==1: 250 # big endian 251 self.byteorder_flt = ">" 252 elif d==4: 253 # little endian 254 self.byteorder_flt = "<" 255 256 if i==1: 257 # big 258 self.byteorder_int = ">" 259 self.wordsize = 1 260 elif i==4: 261 self.byteorder_int = "<" 262 self.wordsize = 4 263 264 def parseFile(self): 265 import shlex 266 267 if self.checkFile(): 268 f = open(self.filename,'rb') 269 270 # sets authoring machine's details 271 self.authorStamp(f) 272 273 # get file size 274 f.seek(0,2) 275 file_len = f.tell() 276 # get header offset 277 f.seek(4) 278 if self.wordsize is not None: 279 (header_start,) = struct.unpack(self.byteorder_int+"i", f.read(4)) 280 else: 281 print("Warning: Byte order of file unknown. Guessing header location.") 282 (header_start,) = struct.unpack("i", f.read(4)) 283 284 # bAdjust is the byte adjustment to compensate for 285 # older machines with smaller atomic sizes 286 host_word_size = len(struct.pack('i',0)) 287 author_word_size = self.wordsize 288 if host_word_size<author_word_size: 289 bAdjust = author_word_size / host_word_size 290 elif host_word_size>author_word_size: 291 bAdjust = author_word_size * host_word_size 292 else: 293 bAdjust = host_word_size 294 295 header_start = (header_start-1) * (bAdjust) 296 297 if file_len<header_start: 298 print("Error: File '%s' cannot be parsed because PyMOL cannot find the header. If you think") 299 print(" PyMOL should be able to read this, plese send the file and this mesage to ") 300 print(" help@schrodinger.com. Thanks!") 301 302 f.seek(header_start) 303 304 curLine = struct.unpack("80s", f.read(80))[0] 305 curLine = str(curLine.decode()) 306 307 while not (curLine.startswith("END")): 308 # yank field identifier 309 (field, tokens) = curLine.split(" ",1) 310 311 H = MTZHeader.HEADER_KEYWORDS 312 try: 313 if field.startswith(H["VERS"]): 314 self.version = tokens 315 elif field.startswith(H["TITLE"]): 316 self.title = tokens 317 elif field.startswith(H["NCOL"]): 318 (self.ncols, self.nrefl, self.nbatches) = tokens.split() 319 elif field.startswith(H["CELL"]): 320 tokens = tokens.split() 321 self.cell_dim = tokens[:3] 322 self.cell_angles = tokens[3:] 323 elif field.startswith(H["SORT"]): 324 self.sort = tokens.split() 325 elif field.startswith(H["SYMINF"]): 326 tokens = shlex.split(tokens) 327 self.nsymmop = tokens[0] 328 self.nprimop = tokens[1] 329 self.lattice = tokens[2] 330 self.groupnum = tokens[3] 331 self.space_group = tokens[4] 332 self.pt_group = tokens[5] 333 334 # valid format for MTZ space group? 335 self.space_group = space_group_map.get(self.space_group, self.space_group) 336 337 elif field.startswith(H["SYMM"]): 338 tokens = tokens.split() 339 tokens = [x.strip(",").strip() for x in tokens] 340 self.symm.append(tokens) 341 elif field.startswith(H["RESO"]): 342 self.reso_min, self.reso_max = tokens.split() 343 self.reso_min = math.sqrt(1./float(self.reso_min)) 344 self.reso_max = math.sqrt(1./float(self.reso_max)) 345 elif field.startswith(H["VALM"]): 346 self.missing_flag = tokens 347 elif field.startswith(H["COL"]): 348 if field == "COLSRC": 349 raise pymol.cmd.QuietException 350 (lab,typ,m,M,i) = tokens.split() 351 if i not in self.datasets: 352 self.datasets[i] = {} 353 self.datasets[i]["cols"] = {} 354 355 self.datasets[i]["cols"][lab] = {} 356 self.datasets[i]["cols"][lab]["type"] = typ 357 self.datasets[i]["cols"][lab]["min"] = m 358 self.datasets[i]["cols"][lab]["max"] = M 359 self.datasets[i]["cols"][lab]["id"] = i 360 elif field.startswith(H["NDIF"]): 361 self.ndif = int(tokens) 362 elif field.startswith(H["PROJECT"]): 363 (i,proj) = tokens.split(None, 1) 364 if i not in self.datasets: 365 self.datasets[i] = {} 366 self.datasets[i]["project"] = proj.strip() 367 elif field.startswith(H["CRYSTAL"]): 368 # can have multiple crystals per dataset? 369 # if, so not supported (overwritten) here. 370 (i,cryst) = tokens.split(None, 1) 371 if i not in self.datasets: 372 self.datasets[i] = {} 373 self.datasets[i]["crystal"] = cryst.strip() 374 elif field.startswith(H["DATASET"]): 375 (i,d) = tokens.split(None, 1) 376 if i not in self.datasets: 377 self.datasets[i] = {} 378 self.datasets[i]["name"] = d.strip() 379 elif field.startswith(H["DCELL"]): 380 (i,x,y,z,a,b,g) = tokens.split() 381 if i not in self.datasets: 382 self.datasets[i] = {} 383 self.datasets[i]['x'] = x 384 self.datasets[i]['y'] = y 385 self.datasets[i]['z'] = z 386 self.datasets[i]['alpha'] = a 387 self.datasets[i]['beta'] = b 388 self.datasets[i]['gamma'] = g 389 elif field.startswith(H["DWAVEL"]): 390 (i,wl) = tokens.split() 391 if i not in self.datasets: 392 self.datasets[i] = {} 393 self.datasets[i]["wavelength"] = wl 394 elif field.startswith(H["BATCH"]): 395 self.batch = tokens 396 else: 397 print("Error Parsing MTZ Header: bad column name: '%s'" % field) 398 except ValueError: 399 print("Error: Parsing MTZ Header poorly formatted MTZ file") 400 print(" bad field: '%s'" % field) 401 except pymol.cmd.QuietException: 402 pass 403 404 curLine = struct.unpack("80s", f.read(80))[0] 405 curLine = str(curLine.decode()) 406 407 408if __name__=="__main__": 409 410 c = CIFHeader("test.cif") 411 print(c.getColumns()) 412 413 m = MTZHeader("test.mtz") 414 print(m.getColumns()) 415