1#!/usr/bin/env python 2""" 3/* ---------------------------------------------------------------------- 4 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 5 https://www.lammps.org/ Sandia National Laboratories 6 Steve Plimpton, sjplimp@sandia.gov 7 8 Copyright (2003) Sandia Corporation. Under the terms of Contract 9 DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains 10 certain rights in this software. This software is distributed under 11 the GNU General Public License. 12 13 See the README file in the top-level LAMMPS directory. 14------------------------------------------------------------------------- */ 15 16/* ---------------------------------------------------------------------- 17 Contributing author: Oliver Henrich (University of Strathclyde, Glasgow) 18------------------------------------------------------------------------- */ 19""" 20 21 22""" 23Import basic modules 24""" 25import sys, os, timeit 26 27from timeit import default_timer as timer 28start_time = timer() 29""" 30Try to import numpy; if failed, import a local version mynumpy 31which needs to be provided 32""" 33try: 34 import numpy as np 35except: 36 print >> sys.stderr, "numpy not found. Exiting." 37 sys.exit(1) 38 39""" 40Check that the required arguments (box offset and size in simulation units 41and the sequence file were provided 42""" 43try: 44 box_offset = float(sys.argv[1]) 45 box_length = float(sys.argv[2]) 46 infile = sys.argv[3] 47except: 48 print >> sys.stderr, "Usage: %s <%s> <%s> <%s>" % (sys.argv[0], \ 49 "box offset", "box length", "file with sequences") 50 sys.exit(1) 51box = np.array ([box_length, box_length, box_length]) 52 53""" 54Try to open the file and fail gracefully if file cannot be opened 55""" 56try: 57 inp = open (infile, 'r') 58 inp.close() 59except: 60 print >> sys.stderr, "Could not open file '%s' for reading. \ 61 Aborting." % infile 62 sys.exit(2) 63 64# return parts of a string 65def partition(s, d): 66 if d in s: 67 sp = s.split(d, 1) 68 return sp[0], d, sp[1] 69 else: 70 return s, "", "" 71 72""" 73Define the model constants 74""" 75# set model constants 76PI = np.pi 77POS_BASE = 0.4 78POS_BACK = -0.4 79EXCL_RC1 = 0.711879214356 80EXCL_RC2 = 0.335388426126 81EXCL_RC3 = 0.52329943261 82 83""" 84Define auxiliary variables for the construction of a helix 85""" 86# center of the double strand 87CM_CENTER_DS = POS_BASE + 0.2 88 89# ideal distance between base sites of two nucleotides 90# which are to be base paired in a duplex 91BASE_BASE = 0.3897628551303122 92 93# cutoff distance for overlap check 94RC2 = 16 95 96# squares of the excluded volume distances for overlap check 97RC2_BACK = EXCL_RC1**2 98RC2_BASE = EXCL_RC2**2 99RC2_BACK_BASE = EXCL_RC3**2 100 101# enumeration to translate from letters to numbers and vice versa 102number_to_base = {1 : 'A', 2 : 'C', 3 : 'G', 4 : 'T'} 103base_to_number = {'A' : 1, 'a' : 1, 'C' : 2, 'c' : 2, 104 'G' : 3, 'g' : 3, 'T' : 4, 't' : 4} 105 106# auxiliary arrays 107positions = [] 108a1s = [] 109a3s = [] 110quaternions = [] 111 112newpositions = [] 113newa1s = [] 114newa3s = [] 115 116basetype = [] 117strandnum = [] 118 119bonds = [] 120 121""" 122Convert local body frame to quaternion DOF 123""" 124def exyz_to_quat (mya1, mya3): 125 126 mya2 = np.cross(mya3, mya1) 127 myquat = [1,0,0,0] 128 129 q0sq = 0.25 * (mya1[0] + mya2[1] + mya3[2] + 1.0) 130 q1sq = q0sq - 0.5 * (mya2[1] + mya3[2]) 131 q2sq = q0sq - 0.5 * (mya1[0] + mya3[2]) 132 q3sq = q0sq - 0.5 * (mya1[0] + mya2[1]) 133 134 # some component must be greater than 1/4 since they sum to 1 135 # compute other components from it 136 137 if q0sq >= 0.25: 138 myquat[0] = np.sqrt(q0sq) 139 myquat[1] = (mya2[2] - mya3[1]) / (4.0*myquat[0]) 140 myquat[2] = (mya3[0] - mya1[2]) / (4.0*myquat[0]) 141 myquat[3] = (mya1[1] - mya2[0]) / (4.0*myquat[0]) 142 elif q1sq >= 0.25: 143 myquat[1] = np.sqrt(q1sq) 144 myquat[0] = (mya2[2] - mya3[1]) / (4.0*myquat[1]) 145 myquat[2] = (mya2[0] + mya1[1]) / (4.0*myquat[1]) 146 myquat[3] = (mya1[2] + mya3[0]) / (4.0*myquat[1]) 147 elif q2sq >= 0.25: 148 myquat[2] = np.sqrt(q2sq) 149 myquat[0] = (mya3[0] - mya1[2]) / (4.0*myquat[2]) 150 myquat[1] = (mya2[0] + mya1[1]) / (4.0*myquat[2]) 151 myquat[3] = (mya3[1] + mya2[2]) / (4.0*myquat[2]) 152 elif q3sq >= 0.25: 153 myquat[3] = np.sqrt(q3sq) 154 myquat[0] = (mya1[1] - mya2[0]) / (4.0*myquat[3]) 155 myquat[1] = (mya3[0] + mya1[2]) / (4.0*myquat[3]) 156 myquat[2] = (mya3[1] + mya2[2]) / (4.0*myquat[3]) 157 158 norm = 1.0/np.sqrt(myquat[0]*myquat[0] + myquat[1]*myquat[1] + \ 159 myquat[2]*myquat[2] + myquat[3]*myquat[3]) 160 myquat[0] *= norm 161 myquat[1] *= norm 162 myquat[2] *= norm 163 myquat[3] *= norm 164 165 return np.array([myquat[0],myquat[1],myquat[2],myquat[3]]) 166 167""" 168Adds a strand to the system by appending it to the array of previous strands 169""" 170def add_strands (mynewpositions, mynewa1s, mynewa3s): 171 overlap = False 172 173 # This is a simple check for each of the particles where for previously 174 # placed particles i we check whether it overlaps with any of the 175 # newly created particles j 176 177 print >> sys.stdout, "## Checking for overlaps" 178 179 for i in xrange(len(positions)): 180 181 p = positions[i] 182 pa1 = a1s[i] 183 184 for j in xrange (len(mynewpositions)): 185 186 q = mynewpositions[j] 187 qa1 = mynewa1s[j] 188 189 # skip particles that are anyway too far away 190 dr = p - q 191 dr -= box * np.rint (dr / box) 192 if np.dot(dr, dr) > RC2: 193 continue 194 195 # base site and backbone site of the two particles 196 p_pos_back = p + pa1 * POS_BACK 197 p_pos_base = p + pa1 * POS_BASE 198 q_pos_back = q + qa1 * POS_BACK 199 q_pos_base = q + qa1 * POS_BASE 200 201 # check for no overlap between the two backbone sites 202 dr = p_pos_back - q_pos_back 203 dr -= box * np.rint (dr / box) 204 if np.dot(dr, dr) < RC2_BACK: 205 overlap = True 206 207 # check for no overlap between the two base sites 208 dr = p_pos_base - q_pos_base 209 dr -= box * np.rint (dr / box) 210 if np.dot(dr, dr) < RC2_BASE: 211 overlap = True 212 213 # check for no overlap between backbone site of particle p 214 # with base site of particle q 215 dr = p_pos_back - q_pos_base 216 dr -= box * np.rint (dr / box) 217 if np.dot(dr, dr) < RC2_BACK_BASE: 218 overlap = True 219 220 # check for no overlap between base site of particle p and 221 # backbone site of particle q 222 dr = p_pos_base - q_pos_back 223 dr -= box * np.rint (dr / box) 224 if np.dot(dr, dr) < RC2_BACK_BASE: 225 overlap = True 226 227 # exit if there is an overlap 228 if overlap: 229 return False 230 231 # append to the existing list if no overlap is found 232 if not overlap: 233 234 for p in mynewpositions: 235 positions.append(p) 236 for p in mynewa1s: 237 a1s.append (p) 238 for p in mynewa3s: 239 a3s.append (p) 240 # calculate quaternion from local body frame and append 241 for ia in xrange(len(mynewpositions)): 242 mynewquaternions = exyz_to_quat(mynewa1s[ia],mynewa3s[ia]) 243 quaternions.append(mynewquaternions) 244 245 return True 246 247 248""" 249Returns the rotation matrix defined by an axis and angle 250""" 251def get_rotation_matrix(axis, anglest): 252 # The argument anglest can be either an angle in radiants 253 # (accepted types are float, int or np.float64 or np.float64) 254 # or a tuple [angle, units] where angle is a number and 255 # units is a string. It tells the routine whether to use degrees, 256 # radiants (the default) or base pairs turns. 257 if not isinstance (anglest, (np.float64, np.float32, float, int)): 258 if len(anglest) > 1: 259 if anglest[1] in ["degrees", "deg", "o"]: 260 #angle = np.deg2rad (anglest[0]) 261 angle = (np.pi / 180.) * (anglest[0]) 262 elif anglest[1] in ["bp"]: 263 angle = int(anglest[0]) * (np.pi / 180.) * (35.9) 264 else: 265 angle = float(anglest[0]) 266 else: 267 angle = float(anglest[0]) 268 else: 269 angle = float(anglest) # in degrees (?) 270 271 axis = np.array(axis) 272 axis /= np.sqrt(np.dot(axis, axis)) 273 274 ct = np.cos(angle) 275 st = np.sin(angle) 276 olc = 1. - ct 277 x, y, z = axis 278 279 return np.array([[olc*x*x+ct, olc*x*y-st*z, olc*x*z+st*y], 280 [olc*x*y+st*z, olc*y*y+ct, olc*y*z-st*x], 281 [olc*x*z-st*y, olc*y*z+st*x, olc*z*z+ct]]) 282 283""" 284Generates the position and orientation vectors of a 285(single or double) strand from a sequence string 286""" 287def generate_strand(bp, sequence=None, start_pos=np.array([0, 0, 0]), \ 288 dir=np.array([0, 0, 1]), perp=False, double=True, rot=0.): 289 # generate empty arrays 290 mynewpositions, mynewa1s, mynewa3s = [], [], [] 291 292 # cast the provided start_pos array into a numpy array 293 start_pos = np.array(start_pos, dtype=float) 294 295 # overall direction of the helix 296 dir = np.array(dir, dtype=float) 297 if sequence == None: 298 sequence = np.random.randint(1, 5, bp) 299 300 # the elseif here is most likely redundant 301 elif len(sequence) != bp: 302 n = bp - len(sequence) 303 sequence += np.random.randint(1, 5, n) 304 print >> sys.stderr, "sequence is too short, adding %d random bases" % n 305 306 # normalize direction 307 dir_norm = np.sqrt(np.dot(dir,dir)) 308 if dir_norm < 1e-10: 309 print >> sys.stderr, "direction must be a valid vector, \ 310 defaulting to (0, 0, 1)" 311 dir = np.array([0, 0, 1]) 312 else: dir /= dir_norm 313 314 # find a vector orthogonal to dir to act as helix direction, 315 # if not provided switch off random orientation 316 if perp is None or perp is False: 317 v1 = np.random.random_sample(3) 318 v1 -= dir * (np.dot(dir, v1)) 319 v1 /= np.sqrt(sum(v1*v1)) 320 else: 321 v1 = perp; 322 323 # generate rotational matrix representing the overall rotation of the helix 324 R0 = get_rotation_matrix(dir, rot) 325 326 # rotation matrix corresponding to one step along the helix 327 R = get_rotation_matrix(dir, [1, "bp"]) 328 329 # set the vector a1 (backbone to base) to v1 330 a1 = v1 331 332 # apply the global rotation to a1 333 a1 = np.dot(R0, a1) 334 335 # set the position of the fist backbone site to start_pos 336 rb = np.array(start_pos) 337 338 # set a3 to the direction of the helix 339 a3 = dir 340 for i in range(bp): 341 # work out the position of the centre of mass of the nucleotide 342 rcdm = rb - CM_CENTER_DS * a1 343 344 # append to newpositions 345 mynewpositions.append(rcdm) 346 mynewa1s.append(a1) 347 mynewa3s.append(a3) 348 349 # if we are not at the end of the helix, we work out a1 and rb for the 350 # next nucleotide along the helix 351 if i != bp - 1: 352 a1 = np.dot(R, a1) 353 rb += a3 * BASE_BASE 354 355 # if we are working on a double strand, we do a cycle similar 356 # to the previous one but backwards 357 if double == True: 358 a1 = -a1 359 a3 = -dir 360 R = R.transpose() 361 for i in range(bp): 362 rcdm = rb - CM_CENTER_DS * a1 363 mynewpositions.append (rcdm) 364 mynewa1s.append (a1) 365 mynewa3s.append (a3) 366 a1 = np.dot(R, a1) 367 rb += a3 * BASE_BASE 368 369 assert (len (mynewpositions) > 0) 370 371 return [mynewpositions, mynewa1s, mynewa3s] 372 373 374""" 375Main function for this script. 376Reads a text file with the following format: 377- Each line contains the sequence for a single strand (A,C,G,T) 378- Lines beginning with the keyword 'DOUBLE' produce double-stranded DNA 379 380Ex: Two ssDNA (single stranded DNA) 381ATATATA 382GCGCGCG 383 384Ex: Two strands, one double stranded, the other single stranded. 385DOUBLE AGGGCT 386CCTGTA 387 388""" 389 390def read_strands(filename): 391 try: 392 infile = open (filename) 393 except: 394 print >> sys.stderr, "Could not open file '%s'. Aborting." % filename 395 sys.exit(2) 396 397 # This block works out the number of nucleotides and strands by reading 398 # the number of non-empty lines in the input file and the number of letters, 399 # taking the possible DOUBLE keyword into account. 400 nstrands, nnucl, nbonds = 0, 0, 0 401 lines = infile.readlines() 402 for line in lines: 403 line = line.upper().strip() 404 if len(line) == 0: 405 continue 406 if line[:6] == 'DOUBLE': 407 line = line.split()[1] 408 length = len(line) 409 print >> sys.stdout, "## Found duplex of %i base pairs" % length 410 nnucl += 2*length 411 nstrands += 2 412 nbonds += (2*length-2) 413 else: 414 line = line.split()[0] 415 length = len(line) 416 print >> sys.stdout, \ 417 "## Found single strand of %i bases" % length 418 nnucl += length 419 nstrands += 1 420 nbonds += length-1 421 # rewind the sequence input file 422 infile.seek(0) 423 424 print >> sys.stdout, "## nstrands, nnucl = ", nstrands, nnucl 425 426 # generate the data file in LAMMPS format 427 try: 428 out = open ("data.oxdna", "w") 429 except: 430 print >> sys.stderr, "Could not open data file for writing. Aborting." 431 sys.exit(2) 432 433 lines = infile.readlines() 434 nlines = len(lines) 435 i = 1 436 myns = 0 437 noffset = 1 438 439 for line in lines: 440 line = line.upper().strip() 441 442 # skip empty lines 443 if len(line) == 0: 444 i += 1 445 continue 446 447 # block for duplexes: last argument of the generate function 448 # is set to 'True' 449 if line[:6] == 'DOUBLE': 450 line = line.split()[1] 451 length = len(line) 452 seq = [(base_to_number[x]) for x in line] 453 454 myns += 1 455 for b in xrange(length): 456 basetype.append(seq[b]) 457 strandnum.append(myns) 458 459 for b in xrange(length-1): 460 bondpair = [noffset + b, noffset + b + 1] 461 bonds.append(bondpair) 462 noffset += length 463 464 # create the sequence of the second strand as made of 465 # complementary bases 466 seq2 = [5-s for s in seq] 467 seq2.reverse() 468 469 myns += 1 470 for b in xrange(length): 471 basetype.append(seq2[b]) 472 strandnum.append(myns) 473 474 for b in xrange(length-1): 475 bondpair = [noffset + b, noffset + b + 1] 476 bonds.append(bondpair) 477 noffset += length 478 479 print >> sys.stdout, "## Created duplex of %i bases" % (2*length) 480 481 # generate random position of the first nucleotide 482 cdm = box_offset + np.random.random_sample(3) * box 483 484 # generate the random direction of the helix 485 axis = np.random.random_sample(3) 486 axis /= np.sqrt(np.dot(axis, axis)) 487 488 # use the generate function defined above to create 489 # the position and orientation vector of the strand 490 newpositions, newa1s, newa3s = generate_strand(len(line), \ 491 sequence=seq, dir=axis, start_pos=cdm, double=True) 492 493 # generate a new position for the strand until it does not overlap 494 # with anything already present 495 start = timer() 496 while not add_strands(newpositions, newa1s, newa3s): 497 cdm = box_offset + np.random.random_sample(3) * box 498 axis = np.random.random_sample(3) 499 axis /= np.sqrt(np.dot(axis, axis)) 500 newpositions, newa1s, newa3s = generate_strand(len(line), \ 501 sequence=seq, dir=axis, start_pos=cdm, double=True) 502 print >> sys.stdout, "## Trying %i" % i 503 end = timer() 504 print >> sys.stdout, "## Added duplex of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \ 505 (2*length, i, nlines, end-start, len(positions), nnucl) 506 507 # block for single strands: last argument of the generate function 508 # is set to 'False' 509 else: 510 length = len(line) 511 seq = [(base_to_number[x]) for x in line] 512 513 myns += 1 514 for b in xrange(length): 515 basetype.append(seq[b]) 516 strandnum.append(myns) 517 518 for b in xrange(length-1): 519 bondpair = [noffset + b, noffset + b + 1] 520 bonds.append(bondpair) 521 noffset += length 522 523 # generate random position of the first nucleotide 524 cdm = box_offset + np.random.random_sample(3) * box 525 526 # generate the random direction of the helix 527 axis = np.random.random_sample(3) 528 axis /= np.sqrt(np.dot(axis, axis)) 529 530 print >> sys.stdout, \ 531 "## Created single strand of %i bases" % length 532 533 newpositions, newa1s, newa3s = generate_strand(length, \ 534 sequence=seq, dir=axis, start_pos=cdm, double=False) 535 start = timer() 536 while not add_strands(newpositions, newa1s, newa3s): 537 cdm = box_offset + np.random.random_sample(3) * box 538 axis = np.random.random_sample(3) 539 axis /= np.sqrt(np.dot(axis, axis)) 540 newpositions, newa1s, newa3s = generate_strand(length, \ 541 sequence=seq, dir=axis, start_pos=cdm, double=False) 542 print >> sys.stdout, "## Trying %i" % (i) 543 end = timer() 544 print >> sys.stdout, "## Added single strand of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \ 545 (length, i, nlines, end-start,len(positions), nnucl) 546 547 i += 1 548 549 # sanity check 550 if not len(positions) == nnucl: 551 print len(positions), nnucl 552 raise AssertionError 553 554 out.write('# LAMMPS data file\n') 555 out.write('%d atoms\n' % nnucl) 556 out.write('%d ellipsoids\n' % nnucl) 557 out.write('%d bonds\n' % nbonds) 558 out.write('\n') 559 out.write('4 atom types\n') 560 out.write('1 bond types\n') 561 out.write('\n') 562 out.write('# System size\n') 563 out.write('%f %f xlo xhi\n' % (box_offset,box_offset+box_length)) 564 out.write('%f %f ylo yhi\n' % (box_offset,box_offset+box_length)) 565 out.write('%f %f zlo zhi\n' % (box_offset,box_offset+box_length)) 566 567 out.write('\n') 568 out.write('Masses\n') 569 out.write('\n') 570 out.write('1 3.1575\n') 571 out.write('2 3.1575\n') 572 out.write('3 3.1575\n') 573 out.write('4 3.1575\n') 574 575 # for each nucleotide print a line under the headers 576 # Atoms, Velocities, Ellipsoids and Bonds 577 out.write('\n') 578 out.write(\ 579 '# Atom-ID, type, position, molecule-ID, ellipsoid flag, density\n') 580 out.write('Atoms\n') 581 out.write('\n') 582 583 for i in xrange(nnucl): 584 out.write('%d %d %22.15le %22.15le %22.15le %d 1 1\n' \ 585 % (i+1, basetype[i], \ 586 positions[i][0], positions[i][1], positions[i][2], \ 587 strandnum[i])) 588 589 out.write('\n') 590 out.write('# Atom-ID, translational, rotational velocity\n') 591 out.write('Velocities\n') 592 out.write('\n') 593 594 for i in xrange(nnucl): 595 out.write("%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n" \ 596 % (i+1,0.0,0.0,0.0,0.0,0.0,0.0)) 597 598 out.write('\n') 599 out.write('# Atom-ID, shape, quaternion\n') 600 out.write('Ellipsoids\n') 601 out.write('\n') 602 603 for i in xrange(nnucl): 604 out.write(\ 605 "%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n" \ 606 % (i+1,1.1739845031423408,1.1739845031423408,1.1739845031423408, \ 607 quaternions[i][0],quaternions[i][1], quaternions[i][2],quaternions[i][3])) 608 609 out.write('\n') 610 out.write('# Bond topology\n') 611 out.write('Bonds\n') 612 out.write('\n') 613 614 for i in xrange(nbonds): 615 out.write("%d %d %d %d\n" % (i+1,1,bonds[i][0],bonds[i][1])) 616 617 out.close() 618 619 print >> sys.stdout, "## Wrote data to 'data.oxdna'" 620 print >> sys.stdout, "## DONE" 621 622# call the above main() function, which executes the program 623read_strands (infile) 624 625end_time=timer() 626runtime = end_time-start_time 627hours = runtime/3600 628minutes = (runtime-np.rint(hours)*3600)/60 629seconds = (runtime-np.rint(hours)*3600-np.rint(minutes)*60)%60 630print >> sys.stdout, "## Total runtime %ih:%im:%.2fs" % (hours,minutes,seconds) 631