1#!/usr/bin/env python
2"""
3/* ----------------------------------------------------------------------
4   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
5   https://www.lammps.org/ Sandia National Laboratories
6   Steve Plimpton, sjplimp@sandia.gov
7
8   Copyright (2003) Sandia Corporation.  Under the terms of Contract
9   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
10   certain rights in this software.  This software is distributed under
11   the GNU General Public License.
12
13   See the README file in the top-level LAMMPS directory.
14------------------------------------------------------------------------- */
15
16/* ----------------------------------------------------------------------
17   Contributing author: Oliver Henrich (University of Strathclyde, Glasgow)
18------------------------------------------------------------------------- */
19"""
20
21
22"""
23Import basic modules
24"""
25import sys, os, timeit
26
27from timeit import default_timer as timer
28start_time = timer()
29"""
30Try to import numpy; if failed, import a local version mynumpy
31which needs to be provided
32"""
33try:
34    import numpy as np
35except:
36    print >> sys.stderr, "numpy not found. Exiting."
37    sys.exit(1)
38
39"""
40Check that the required arguments (box offset and size in simulation units
41and the sequence file were provided
42"""
43try:
44    box_offset = float(sys.argv[1])
45    box_length = float(sys.argv[2])
46    infile = sys.argv[3]
47except:
48    print >> sys.stderr, "Usage: %s <%s> <%s> <%s>" % (sys.argv[0], \
49	"box offset", "box length", "file with sequences")
50    sys.exit(1)
51box = np.array ([box_length, box_length, box_length])
52
53"""
54Try to open the file and fail gracefully if file cannot be opened
55"""
56try:
57    inp = open (infile, 'r')
58    inp.close()
59except:
60    print >> sys.stderr, "Could not open file '%s' for reading. \
61					      Aborting." % infile
62    sys.exit(2)
63
64# return parts of a string
65def partition(s, d):
66    if d in s:
67        sp = s.split(d, 1)
68        return sp[0], d, sp[1]
69    else:
70        return s, "", ""
71
72"""
73Define the model constants
74"""
75# set model constants
76PI = np.pi
77POS_BASE =  0.4
78POS_BACK = -0.4
79EXCL_RC1 =  0.711879214356
80EXCL_RC2 =  0.335388426126
81EXCL_RC3 =  0.52329943261
82
83"""
84Define auxiliary variables for the construction of a helix
85"""
86# center of the double strand
87CM_CENTER_DS = POS_BASE + 0.2
88
89# ideal distance between base sites of two nucleotides
90# which are to be base paired in a duplex
91BASE_BASE = 0.3897628551303122
92
93# cutoff distance for overlap check
94RC2 = 16
95
96# squares of the excluded volume distances for overlap check
97RC2_BACK = EXCL_RC1**2
98RC2_BASE = EXCL_RC2**2
99RC2_BACK_BASE = EXCL_RC3**2
100
101# enumeration to translate from letters to numbers and vice versa
102number_to_base = {1 : 'A', 2 : 'C', 3 : 'G', 4 : 'T'}
103base_to_number = {'A' : 1, 'a' : 1, 'C' : 2, 'c' : 2,
104                  'G' : 3, 'g' : 3, 'T' : 4, 't' : 4}
105
106# auxiliary arrays
107positions = []
108a1s = []
109a3s = []
110quaternions = []
111
112newpositions = []
113newa1s = []
114newa3s = []
115
116basetype  = []
117strandnum = []
118
119bonds = []
120
121"""
122Convert local body frame to quaternion DOF
123"""
124def exyz_to_quat (mya1, mya3):
125
126    mya2 = np.cross(mya3, mya1)
127    myquat = [1,0,0,0]
128
129    q0sq = 0.25 * (mya1[0] + mya2[1] + mya3[2] + 1.0)
130    q1sq = q0sq - 0.5 * (mya2[1] + mya3[2])
131    q2sq = q0sq - 0.5 * (mya1[0] + mya3[2])
132    q3sq = q0sq - 0.5 * (mya1[0] + mya2[1])
133
134    # some component must be greater than 1/4 since they sum to 1
135    # compute other components from it
136
137    if q0sq >= 0.25:
138	myquat[0] = np.sqrt(q0sq)
139	myquat[1] = (mya2[2] - mya3[1]) / (4.0*myquat[0])
140	myquat[2] = (mya3[0] - mya1[2]) / (4.0*myquat[0])
141	myquat[3] = (mya1[1] - mya2[0]) / (4.0*myquat[0])
142    elif q1sq >= 0.25:
143	myquat[1] = np.sqrt(q1sq)
144	myquat[0] = (mya2[2] - mya3[1]) / (4.0*myquat[1])
145	myquat[2] = (mya2[0] + mya1[1]) / (4.0*myquat[1])
146	myquat[3] = (mya1[2] + mya3[0]) / (4.0*myquat[1])
147    elif q2sq >= 0.25:
148	myquat[2] = np.sqrt(q2sq)
149	myquat[0] = (mya3[0] - mya1[2]) / (4.0*myquat[2])
150	myquat[1] = (mya2[0] + mya1[1]) / (4.0*myquat[2])
151	myquat[3] = (mya3[1] + mya2[2]) / (4.0*myquat[2])
152    elif q3sq >= 0.25:
153	myquat[3] = np.sqrt(q3sq)
154	myquat[0] = (mya1[1] - mya2[0]) / (4.0*myquat[3])
155	myquat[1] = (mya3[0] + mya1[2]) / (4.0*myquat[3])
156	myquat[2] = (mya3[1] + mya2[2]) / (4.0*myquat[3])
157
158    norm = 1.0/np.sqrt(myquat[0]*myquat[0] + myquat[1]*myquat[1] + \
159			  myquat[2]*myquat[2] + myquat[3]*myquat[3])
160    myquat[0] *= norm
161    myquat[1] *= norm
162    myquat[2] *= norm
163    myquat[3] *= norm
164
165    return np.array([myquat[0],myquat[1],myquat[2],myquat[3]])
166
167"""
168Adds a strand to the system by appending it to the array of previous strands
169"""
170def add_strands (mynewpositions, mynewa1s, mynewa3s):
171    overlap = False
172
173    # This is a simple check for each of the particles where for previously
174    # placed particles i we check whether it overlaps with any of the
175    # newly created particles j
176
177    print >> sys.stdout, "## Checking for overlaps"
178
179    for i in xrange(len(positions)):
180
181	p = positions[i]
182	pa1 = a1s[i]
183
184	for j in xrange (len(mynewpositions)):
185
186	    q = mynewpositions[j]
187	    qa1 = mynewa1s[j]
188
189	    # skip particles that are anyway too far away
190	    dr = p - q
191	    dr -= box * np.rint (dr / box)
192	    if np.dot(dr, dr) > RC2:
193		continue
194
195	    # base site and backbone site of the two particles
196            p_pos_back = p + pa1 * POS_BACK
197            p_pos_base = p + pa1 * POS_BASE
198            q_pos_back = q + qa1 * POS_BACK
199            q_pos_base = q + qa1 * POS_BASE
200
201	    # check for no overlap between the two backbone sites
202            dr = p_pos_back - q_pos_back
203            dr -= box * np.rint (dr / box)
204            if np.dot(dr, dr) < RC2_BACK:
205                overlap = True
206
207	    # check for no overlap between the two base sites
208            dr = p_pos_base -  q_pos_base
209            dr -= box * np.rint (dr / box)
210            if np.dot(dr, dr) < RC2_BASE:
211                overlap = True
212
213	    # check for no overlap between backbone site of particle p
214	    # with base site of particle q
215            dr = p_pos_back - q_pos_base
216            dr -= box * np.rint (dr / box)
217            if np.dot(dr, dr) < RC2_BACK_BASE:
218                overlap = True
219
220	    # check for no overlap between base site of particle p and
221	    # backbone site of particle q
222            dr = p_pos_base - q_pos_back
223            dr -= box * np.rint (dr / box)
224            if np.dot(dr, dr) < RC2_BACK_BASE:
225                overlap = True
226
227	    # exit if there is an overlap
228            if overlap:
229                return False
230
231    # append to the existing list if no overlap is found
232    if not overlap:
233
234        for p in mynewpositions:
235            positions.append(p)
236        for p in mynewa1s:
237            a1s.append (p)
238        for p in mynewa3s:
239            a3s.append (p)
240	# calculate quaternion from local body frame and append
241	for ia in xrange(len(mynewpositions)):
242	    mynewquaternions = exyz_to_quat(mynewa1s[ia],mynewa3s[ia])
243	    quaternions.append(mynewquaternions)
244
245    return True
246
247
248"""
249Returns the rotation matrix defined by an axis and angle
250"""
251def get_rotation_matrix(axis, anglest):
252    # The argument anglest can be either an angle in radiants
253    # (accepted types are float, int or np.float64 or np.float64)
254    # or a tuple [angle, units] where angle is a number and
255    # units is a string. It tells the routine whether to use degrees,
256    # radiants (the default) or base pairs turns.
257    if not isinstance (anglest, (np.float64, np.float32, float, int)):
258        if len(anglest) > 1:
259            if anglest[1] in ["degrees", "deg", "o"]:
260                #angle = np.deg2rad (anglest[0])
261                angle = (np.pi / 180.) * (anglest[0])
262            elif anglest[1] in ["bp"]:
263                angle = int(anglest[0]) * (np.pi / 180.) * (35.9)
264            else:
265                angle = float(anglest[0])
266        else:
267            angle = float(anglest[0])
268    else:
269        angle = float(anglest) # in degrees (?)
270
271    axis = np.array(axis)
272    axis /= np.sqrt(np.dot(axis, axis))
273
274    ct = np.cos(angle)
275    st = np.sin(angle)
276    olc = 1. - ct
277    x, y, z = axis
278
279    return np.array([[olc*x*x+ct, olc*x*y-st*z, olc*x*z+st*y],
280                    [olc*x*y+st*z, olc*y*y+ct, olc*y*z-st*x],
281                    [olc*x*z-st*y, olc*y*z+st*x, olc*z*z+ct]])
282
283"""
284Generates the position and orientation vectors of a
285(single or double) strand from a sequence string
286"""
287def generate_strand(bp, sequence=None, start_pos=np.array([0, 0, 0]), \
288	  dir=np.array([0, 0, 1]), perp=False, double=True, rot=0.):
289    # generate empty arrays
290    mynewpositions, mynewa1s, mynewa3s = [], [], []
291
292    # cast the provided start_pos array into a numpy array
293    start_pos = np.array(start_pos, dtype=float)
294
295    # overall direction of the helix
296    dir = np.array(dir, dtype=float)
297    if sequence == None:
298	sequence = np.random.randint(1, 5, bp)
299
300    # the elseif here is most likely redundant
301    elif len(sequence) != bp:
302	n = bp - len(sequence)
303	sequence += np.random.randint(1, 5, n)
304	print >> sys.stderr, "sequence is too short, adding %d random bases" % n
305
306    # normalize direction
307    dir_norm = np.sqrt(np.dot(dir,dir))
308    if dir_norm < 1e-10:
309	print >> sys.stderr, "direction must be a valid vector, \
310			      defaulting to (0, 0, 1)"
311	dir = np.array([0, 0, 1])
312    else: dir /= dir_norm
313
314    # find a vector orthogonal to dir to act as helix direction,
315    # if not provided switch off random orientation
316    if perp is None or perp is False:
317	v1 = np.random.random_sample(3)
318	v1 -= dir * (np.dot(dir, v1))
319	v1 /= np.sqrt(sum(v1*v1))
320    else:
321	v1 = perp;
322
323    # generate rotational matrix representing the overall rotation of the helix
324    R0 = get_rotation_matrix(dir, rot)
325
326    # rotation matrix corresponding to one step along the helix
327    R = get_rotation_matrix(dir, [1, "bp"])
328
329    # set the vector a1 (backbone to base) to v1
330    a1 = v1
331
332    # apply the global rotation to a1
333    a1 = np.dot(R0, a1)
334
335    # set the position of the fist backbone site to start_pos
336    rb = np.array(start_pos)
337
338    # set a3 to the direction of the helix
339    a3 = dir
340    for i in range(bp):
341    # work out the position of the centre of mass of the nucleotide
342	rcdm = rb - CM_CENTER_DS * a1
343
344	# append to newpositions
345	mynewpositions.append(rcdm)
346	mynewa1s.append(a1)
347	mynewa3s.append(a3)
348
349	# if we are not at the end of the helix, we work out a1 and rb for the
350	# next nucleotide along the helix
351	if i != bp - 1:
352	    a1 = np.dot(R, a1)
353	    rb += a3 * BASE_BASE
354
355    # if we are working on a double strand, we do a cycle similar
356    # to the previous one but backwards
357    if double == True:
358	a1 = -a1
359	a3 = -dir
360	R = R.transpose()
361	for i in range(bp):
362	    rcdm = rb - CM_CENTER_DS * a1
363	    mynewpositions.append (rcdm)
364	    mynewa1s.append (a1)
365	    mynewa3s.append (a3)
366	    a1 = np.dot(R, a1)
367	    rb += a3 * BASE_BASE
368
369    assert (len (mynewpositions) > 0)
370
371    return [mynewpositions, mynewa1s, mynewa3s]
372
373
374"""
375Main function for this script.
376Reads a text file with the following format:
377- Each line contains the sequence for a single strand (A,C,G,T)
378- Lines beginning with the keyword 'DOUBLE' produce double-stranded DNA
379
380Ex: Two ssDNA (single stranded DNA)
381ATATATA
382GCGCGCG
383
384Ex: Two strands, one double stranded, the other single stranded.
385DOUBLE AGGGCT
386CCTGTA
387
388"""
389
390def read_strands(filename):
391    try:
392        infile = open (filename)
393    except:
394        print >> sys.stderr, "Could not open file '%s'. Aborting." % filename
395        sys.exit(2)
396
397    # This block works out the number of nucleotides and strands by reading
398    # the number of non-empty lines in the input file and the number of letters,
399    # taking the possible DOUBLE keyword into account.
400    nstrands, nnucl, nbonds = 0, 0, 0
401    lines = infile.readlines()
402    for line in lines:
403        line = line.upper().strip()
404        if len(line) == 0:
405            continue
406        if line[:6] == 'DOUBLE':
407            line = line.split()[1]
408            length = len(line)
409            print >> sys.stdout, "## Found duplex of %i base pairs" % length
410            nnucl += 2*length
411            nstrands += 2
412	    nbonds += (2*length-2)
413        else:
414            line = line.split()[0]
415            length = len(line)
416            print >> sys.stdout, \
417		    "## Found single strand of %i bases" % length
418            nnucl += length
419            nstrands += 1
420	    nbonds += length-1
421    # rewind the sequence input file
422    infile.seek(0)
423
424    print >> sys.stdout, "## nstrands, nnucl = ", nstrands, nnucl
425
426    # generate the data file in LAMMPS format
427    try:
428        out = open ("data.oxdna", "w")
429    except:
430        print >> sys.stderr, "Could not open data file for writing. Aborting."
431        sys.exit(2)
432
433    lines = infile.readlines()
434    nlines = len(lines)
435    i = 1
436    myns = 0
437    noffset = 1
438
439    for line in lines:
440        line = line.upper().strip()
441
442        # skip empty lines
443        if len(line) == 0:
444	    i += 1
445	    continue
446
447	# block for duplexes: last argument of the generate function
448	# is set to 'True'
449        if line[:6] == 'DOUBLE':
450            line = line.split()[1]
451            length = len(line)
452            seq = [(base_to_number[x]) for x in line]
453
454	    myns += 1
455	    for b in xrange(length):
456		basetype.append(seq[b])
457		strandnum.append(myns)
458
459	    for b in xrange(length-1):
460		bondpair = [noffset + b, noffset + b + 1]
461		bonds.append(bondpair)
462	    noffset += length
463
464	    # create the sequence of the second strand as made of
465	    # complementary bases
466	    seq2 = [5-s for s in seq]
467	    seq2.reverse()
468
469	    myns += 1
470	    for b in xrange(length):
471		basetype.append(seq2[b])
472		strandnum.append(myns)
473
474	    for b in xrange(length-1):
475		bondpair = [noffset + b, noffset + b + 1]
476		bonds.append(bondpair)
477	    noffset += length
478
479            print >> sys.stdout, "## Created duplex of %i bases" % (2*length)
480
481	    # generate random position of the first nucleotide
482            cdm = box_offset + np.random.random_sample(3) * box
483
484            # generate the random direction of the helix
485            axis = np.random.random_sample(3)
486            axis /= np.sqrt(np.dot(axis, axis))
487
488            # use the generate function defined above to create
489	    # the position and orientation vector of the strand
490            newpositions, newa1s, newa3s = generate_strand(len(line), \
491		    sequence=seq, dir=axis, start_pos=cdm, double=True)
492
493            # generate a new position for the strand until it does not overlap
494	    # with anything already present
495	    start = timer()
496            while not add_strands(newpositions, newa1s, newa3s):
497                cdm = box_offset + np.random.random_sample(3) * box
498                axis = np.random.random_sample(3)
499                axis /= np.sqrt(np.dot(axis, axis))
500                newpositions, newa1s, newa3s = generate_strand(len(line), \
501		      sequence=seq, dir=axis, start_pos=cdm, double=True)
502                print >> sys.stdout, "## Trying %i" % i
503	    end = timer()
504            print >> sys.stdout, "## Added duplex of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
505				      (2*length, i, nlines, end-start, len(positions), nnucl)
506
507	# block for single strands: last argument of the generate function
508	# is set to 'False'
509        else:
510            length = len(line)
511            seq = [(base_to_number[x]) for x in line]
512
513	    myns += 1
514	    for b in xrange(length):
515		basetype.append(seq[b])
516		strandnum.append(myns)
517
518	    for b in xrange(length-1):
519		bondpair = [noffset + b, noffset + b + 1]
520		bonds.append(bondpair)
521	    noffset += length
522
523	    # generate random position of the first nucleotide
524            cdm = box_offset + np.random.random_sample(3) * box
525
526            # generate the random direction of the helix
527            axis = np.random.random_sample(3)
528            axis /= np.sqrt(np.dot(axis, axis))
529
530            print >> sys.stdout, \
531		      "## Created single strand of %i bases" % length
532
533            newpositions, newa1s, newa3s = generate_strand(length, \
534		      sequence=seq, dir=axis, start_pos=cdm, double=False)
535	    start = timer()
536            while not add_strands(newpositions, newa1s, newa3s):
537                cdm = box_offset + np.random.random_sample(3) * box
538                axis = np.random.random_sample(3)
539		axis /= np.sqrt(np.dot(axis, axis))
540                newpositions, newa1s, newa3s = generate_strand(length, \
541			  sequence=seq, dir=axis, start_pos=cdm, double=False)
542                print >> sys.stdout, "## Trying  %i" % (i)
543	    end = timer()
544            print >> sys.stdout, "## Added single strand of %i bases (line %i/%i) in %.2fs, now at %i/%i" % \
545				      (length, i, nlines, end-start,len(positions), nnucl)
546
547        i += 1
548
549    # sanity check
550    if not len(positions) == nnucl:
551        print len(positions), nnucl
552        raise AssertionError
553
554    out.write('# LAMMPS data file\n')
555    out.write('%d atoms\n' % nnucl)
556    out.write('%d ellipsoids\n' % nnucl)
557    out.write('%d bonds\n' % nbonds)
558    out.write('\n')
559    out.write('4 atom types\n')
560    out.write('1 bond types\n')
561    out.write('\n')
562    out.write('# System size\n')
563    out.write('%f %f xlo xhi\n' % (box_offset,box_offset+box_length))
564    out.write('%f %f ylo yhi\n' % (box_offset,box_offset+box_length))
565    out.write('%f %f zlo zhi\n' % (box_offset,box_offset+box_length))
566
567    out.write('\n')
568    out.write('Masses\n')
569    out.write('\n')
570    out.write('1 3.1575\n')
571    out.write('2 3.1575\n')
572    out.write('3 3.1575\n')
573    out.write('4 3.1575\n')
574
575    # for each nucleotide print a line under the headers
576    # Atoms, Velocities, Ellipsoids and Bonds
577    out.write('\n')
578    out.write(\
579      '# Atom-ID, type, position, molecule-ID, ellipsoid flag, density\n')
580    out.write('Atoms\n')
581    out.write('\n')
582
583    for i in xrange(nnucl):
584	out.write('%d %d %22.15le %22.15le %22.15le %d 1 1\n' \
585		  % (i+1, basetype[i], \
586		     positions[i][0], positions[i][1], positions[i][2], \
587		     strandnum[i]))
588
589    out.write('\n')
590    out.write('# Atom-ID, translational, rotational velocity\n')
591    out.write('Velocities\n')
592    out.write('\n')
593
594    for i in xrange(nnucl):
595	out.write("%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n" \
596		  % (i+1,0.0,0.0,0.0,0.0,0.0,0.0))
597
598    out.write('\n')
599    out.write('# Atom-ID, shape, quaternion\n')
600    out.write('Ellipsoids\n')
601    out.write('\n')
602
603    for i in xrange(nnucl):
604	out.write(\
605    "%d %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le %22.15le\n"  \
606      % (i+1,1.1739845031423408,1.1739845031423408,1.1739845031423408, \
607	quaternions[i][0],quaternions[i][1], quaternions[i][2],quaternions[i][3]))
608
609    out.write('\n')
610    out.write('# Bond topology\n')
611    out.write('Bonds\n')
612    out.write('\n')
613
614    for i in xrange(nbonds):
615	out.write("%d  %d  %d  %d\n" % (i+1,1,bonds[i][0],bonds[i][1]))
616
617    out.close()
618
619    print >> sys.stdout, "## Wrote data to 'data.oxdna'"
620    print >> sys.stdout, "## DONE"
621
622# call the above main() function, which executes the program
623read_strands (infile)
624
625end_time=timer()
626runtime = end_time-start_time
627hours = runtime/3600
628minutes = (runtime-np.rint(hours)*3600)/60
629seconds = (runtime-np.rint(hours)*3600-np.rint(minutes)*60)%60
630print >> sys.stdout, "## Total runtime %ih:%im:%.2fs" % (hours,minutes,seconds)
631