1  /*==========================================================================
2              RazerS - Fast Read Mapping with Controlled Loss Rate
3                    http://www.seqan.de/projects/razers.html
4 
5  ============================================================================
6   Copyright (C) 2008 by Anne-Katrin Emde
7 
8   This program is free software; you can redistribute it and/or
9   modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 3 of the License, or (at your options) any later version.
12 
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  ==========================================================================*/
21 
22 #ifndef SEQAN_HEADER_READ_SIMULATOR_H
23 #define SEQAN_HEADER_READ_SIMULATOR_H
24 
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <algorithm>
29 #include <random>
30 
31 #include <seqan/sequence.h>
32 #include <seqan/find.h>
33 #include <seqan/modifier.h>
34 
35 
36 namespace seqan
37 {
38 
39 template <typename TOperation, typename TAlphabet>
40 inline TAlphabet
sample(std::mt19937 & rng,TOperation m,TAlphabet base)41 sample(std::mt19937 & rng, TOperation m, TAlphabet base)
42 {
43     std::uniform_int_distribution<int> dist(0, 2);
44 
45     int ret = Dna(dist(rng));
46     if (m == SEQAN_MISMATCH && ret >= ordValue(base))
47         ret += 1;
48     return Dna(ret);
49 }
50 
51 template < typename TGenome >
simulateGenome(std::mt19937 & rng,TGenome & genome,int size)52 void simulateGenome(std::mt19937 & rng,
53                     TGenome &genome, int size)
54 {
55 //	mtRandInit();
56 	resize(genome, size);
57 	for(int i = 0; i < size; ++i)
58 		genome[i] = sample(rng, 0, (Dna)0);
59 }
60 
61 
62 template<typename TPosString>
63 void
fillupStartpos(std::mt19937 & rng,TPosString & sortedStartPos,int numReads,int readLength,int maxErrors,int libSize,int libError,int seqLength,double forwardProb)64 fillupStartpos(
65     std::mt19937 & rng,
66     TPosString & sortedStartPos,
67 	int numReads,
68 	int readLength,
69 	int maxErrors,			// how many errors they may have
70 	int libSize,			// library size, 0 disables mate-pair simulation
71 	int libError,
72 	int seqLength,			// maximal library size deviation
73 	double forwardProb)
74 {
75 
76 	const int REVCOMP = 1 << (sizeof(int)*8-1);
77 	resize(sortedStartPos, numReads);
78 
79 	int fragmentSize = readLength;
80 	if (libSize > 0)
81 		fragmentSize = libSize + libError;
82 
83 
84 	// sample positions
85 	std::uniform_real_distribution<double> dist(0, 1);
86 	for (int i = 0; i < numReads; ++i)
87 		sortedStartPos[i] = (int)((seqLength - fragmentSize - maxErrors + 1.0) * dist(rng));
88 
89 	std::sort(begin(sortedStartPos),end(sortedStartPos));
90 
91 	// sample orientations
92 	for (int i = 0; i < numReads; ++i)
93 	    if (dist(rng) >= forwardProb)
94 			sortedStartPos[i] |= REVCOMP;
95 
96 	if (libSize > 0)
97 	{
98 		resize(sortedStartPos,2*numReads);
99 		// sample mate-pair positions and inverse orientations
100 		for(int i=0;i<numReads;++i)
101 		{
102 			int leftPos = sortedStartPos[i] & ~REVCOMP;
103 			int lSize = fragmentSize - (int)((2.0 * libError + 1.0) * dist(rng));
104 			int rightPos = leftPos + lSize - readLength;
105 			if ((sortedStartPos[i] & REVCOMP) == 0)
106 			{
107 				sortedStartPos[i+numReads] = rightPos | REVCOMP;
108 			}
109 			else
110 			{
111 				sortedStartPos[i] = rightPos | REVCOMP;
112 				sortedStartPos[i+numReads] = leftPos;
113 			}
114 		}
115 		numReads*=2;
116 	}
117 }
118 
119 
120 
121 //////////
122 // Simulates a set of reads from a set of haplotypes with a certain error distribution
123 //////////
124 template <
125 	typename TReadSet,
126 	typename TReadIDs,
127 	typename TGenomeSet,
128 	typename TDistr >
129 void simulateReads(
130             std::mt19937 & rng,
131 	TReadSet &readSet,		// generated read sequences
132 	TReadIDs &readIDs,		// corresponding Fasta ids
133 	TGenomeSet &genomeSet,	// source genome sequences
134 	int numReads,			// how many reads should be generated
135 	int maxErrors,			// how many errors they may have
136 	TDistr &errorDist,		// error probability distribution
137 	int libSize,			// library size, 0 disables mate-pair simulation
138 	int libError,			// maximal library size deviation
139 	double forwardProb,
140 	bool verbose = false)
141 {
142 	//typedef typename Value<TReadSet>::Type				TRead;
143 	//typedef typename Value<TGenomeSet>::Type			TGenome;
144 	//typedef typename Infix<TGenome>::Type				TGenomeInfix;
145 	//typedef ModifiedString<TGenomeInfix, ModReverse>	TGenomeInfixRev;
146 	//typedef ModifiedString<TRead, ModReverse>			TReadRev;
147 
148 	//typedef Finder<TGenomeInfix>						TMyersFinder;
149 	//typedef Pattern<TRead, MyersUkkonen>				TMyersPattern;
150 
151 //	typedef Finder<TGenomeInfix>						TMyersFinderRev;
152 	//typedef Finder<TGenomeInfixRev>						TMyersFinderRev;
153 	//typedef Pattern<TReadRev, MyersUkkonenGlobal>		TMyersPatternRev;
154 
155 //	mtRandInit();
156 //	typedef TGenome TRevComp;
157 
158 	int readLength = length(errorDist)/4;
159 	const int REVCOMP = 1 << (sizeof(int)*8-1);
160 	//int KJ = 2*maxErrors;
161 
162 	String<int> bucketCounter;
163 	resize(bucketCounter,maxErrors,0);
164 
165 	String<int> kickOutCount;
166 	resize(kickOutCount,maxErrors,0);
167 
168 	if (verbose)
169 		std::cout << "\nSimulating...";
170 
171 	String<int> modificationPattern;
172 	reserve(modificationPattern, readLength + maxErrors);
173 	int inValidModPat = 0;
174 
175 	//at the moment: nur eine source sequenz --> nimm immer genomeSet[0]
176 	TGenome& currentSource = genomeSet[0];
177 	int seqLength = length(currentSource);
178 
179 	// int fragmentSize = readLength;
180 	// if (libSize > 0)
181 	// 	fragmentSize = libSize + libError;
182 
183 	String<int> sortedStartPos;
184 /*				# Pick a library size
185 				currentLibrary = sample(1:(length(librarySizes)), 1)
186 				lSize = round(rnorm(1, mean=librarySizes[currentLibrary], sd=librarySd[currentLibrary]))
187 				if (start < end) {
188 					if (start + libSize <= seqLength) {
189 						startMatePair = start + libSize - readLength + 1
190 						endMatePair = startMatePair + readLength - 1
191 						readMate = sourceSeq[startMatePair:endMatePair]
192 						readMate = reverseComplement(readMate)
193 						tmp = startMatePair
194 						startMatePair = endMatePair
195 						endMatePair = tmp
196 						invalidMatePair = 0
197 					}
198 				} else {
199 					if (start - libSize >= 1) {
200 						startMatePair = start - libSize
201 						endMatePair = startMatePair + readLength - 1
202 						readMate = sourceSeq[startMatePair:endMatePair]
203 						invalidMatePair = 0
204 					}
205 				}*/
206 
207 	int realNumReads = numReads;
208 	unsigned int samplePosCounter = 0;
209 	int readCounter = 0;
210 	while (readCounter < numReads) {
211 		clear(modificationPattern);
212 		//# Pick a haplotype
213 		//currentHaplotype = sample(1:(length(haplotypes)), 1)
214 
215 		// Sample a read
216 		if(samplePosCounter >= length(sortedStartPos))//get new start positions
217 		{
218 			fillupStartpos(rng, sortedStartPos, numReads, readLength, maxErrors,	libSize, libError, seqLength, forwardProb);
219 			samplePosCounter = 0;
220 		}
221 		int  startPos = sortedStartPos[samplePosCounter] & ~REVCOMP;
222 		bool revComp  = (sortedStartPos[samplePosCounter] & REVCOMP) != 0;
223 		int  maxEnd   = startPos + readLength + maxErrors - 1;
224 
225 		TGenome read;
226 		resize(read,readLength);
227 
228 		TGenome readTemplate;// = infix(currentSource,startPos,maxEnd);
229 		resize(readTemplate,maxEnd-startPos);
230 		arrayCopy(iter(currentSource,startPos), iter(currentSource,maxEnd), begin(readTemplate)); //infix(currentSource,startPos,maxEnd);
231 
232 		if(revComp) reverseComplement(readTemplate);
233 
234 		// int lastOp = 0;
235 		int currOp = 0;
236 		// Sequence the reads
237 		int countErrors = 0;
238 		int pos = 0;
239 		int trueLength = 0;
240 		bool successful = false;
241 
242 		std::uniform_real_distribution<double> dist(0, 1);
243 
244 		while(pos < maxEnd) {
245 			// lastOp = currOp;
246 			double prob = dist(rng);
247 			//	std::cout << "prob = " << prob << "\t";
248 			int m;
249 			for(m = 0; m < 4; ++m)
250 			{
251 				double modProb = _transformBack(errorDist[m*readLength + trueLength]);
252 				if (prob < modProb)
253 				{
254 					currOp = m;
255 					break;
256 				}
257 				prob -= modProb;
258 			}
259 			if (m == 4) std::cout << "HUH?";
260 		//	std::cout << "operation = " << operation << "\t";
261 			if(pos==0 &&  currOp == SEQAN_INSERT)// (currOp==SEQAN_DELETE || currOp == SEQAN_INSERT))
262 			{
263 				currOp = 0;
264 				continue;
265 			}
266 			appendValue(modificationPattern,currOp,Generous());
267 
268 			// ignore reads with Ns
269 			if (currOp != SEQAN_INSERT && readTemplate[pos] == 'N')
270 				countErrors = maxErrors + 1;
271 
272 			// Insert Delete is the same as Delete Insert and both are the same as Mismatch (or match)
273 			if(currOp == SEQAN_MATCH) read[trueLength] = readTemplate[pos];
274 			else
275 			{
276 				++countErrors;
277 				if(currOp != SEQAN_INSERT)
278 					read[trueLength] = sample(rng, currOp,readTemplate[pos]);
279 			}
280 			if(currOp != SEQAN_INSERT) ++trueLength; //if read nucleotide is not deleted
281 			if(currOp != SEQAN_DELETE) ++pos; //if read nucleotide is not an insert
282 //			if((lastOp==SEQAN_DELETE && currOp==SEQAN_INSERT) || (currOp==SEQAN_DELETE && lastOp==SEQAN_INSERT))
283 //			{
284 //				--countErrors;
285 //				currOp = SEQAN_MISMATCH;
286 //			//	std::cout << "ID=DI=M\n";
287 //			}
288 		//	std::cout << "true len = " << trueLength << std::endl;
289 			if(trueLength == readLength || countErrors >= maxErrors)
290 			{
291 				if(countErrors < maxErrors)// && currOp != SEQAN_INSERT)
292 					successful = true;
293 				break;
294 			}
295 		}
296 
297 	/*
298 		if(successful)
299 		{
300 			int patLen = length(modificationPattern);
301 			int err = 0, del = 0;
302 			for (int j = 0; j < KJ; ++j)
303 			{
304 				switch (modificationPattern[j]) {
305 					case SEQAN_MATCH:
306 						++del;
307 						break;
308 
309 					case SEQAN_DELETE:
310 						++del;
311 						SEQAN_FALLTHROUGH
312 
313 					case SEQAN_INSERT:
314 						++err;
315 						break;
316 
317 					default:;
318 				}
319 				if (del > 0 && del <= err)
320 				{
321 					successful = false;
322 					++inValidModPat;
323 					break;
324 				}
325 			}
326 			if(successful)
327 			{
328 				err = del = 0;
329 				for (int j = patLen - 1; j >= patLen - KJ; --j)
330 				{
331 					switch (modificationPattern[j]) {
332 						case SEQAN_MATCH:
333 							++del;
334 							break;
335 
336 						case SEQAN_DELETE:
337 							++del;
338 							SEQAN_FALLTHROUGH
339 
340 						case SEQAN_INSERT:
341 							++err;
342 							break;
343 
344 						default:;
345 					}
346 					if (del > 0 && del <= err)
347 					{
348 						successful = false;
349 						++inValidModPat;
350 						break;
351 					}
352 				}
353 			}
354 		}
355 		*/
356 
357 /*		countMateErrors = 0
358 		if (simulateMatePairs == 1) {
359 			for(pos in 1:length(readMate)) {
360 				if (runif(1) <= _transformBack(errorDist[pos])) {
361 					readMate[pos] = sample(alphabet[alphabet!= readMate[pos]], 1)
362 					countMateErrors = countMateErrors + 1
363 				}
364 			}
365 		}*/
366 
367 		if(successful)
368 		{
369 			//verify that number of errors is correct
370 			bool kickOut = false;
371 /*			int start1 = startPos;
372 			int maxEnd1 = maxEnd;
373 			while(start1 > 0 && (startPos - start1) < countErrors) --start1;
374 			while(maxEnd1 > 0 && (maxEnd1 - maxEnd) < countErrors) ++maxEnd1;
375 			TGenomeInfix genomeInfix(currentSource,start1,maxEnd1);
376 			TMyersFinder myersFinder(genomeInfix);
377 
378 			// init forward verifiers
379 			if(revComp) reverseComplement(read);
380 			TMyersPattern forwardPattern(read);
381 			TMyersPattern &myersPattern = forwardPattern;
382 
383 			// find end of best semi-global alignment
384 			int maxScore = std::numeric_limits<int>::min();
385 			int minScore = -(int)countErrors;
386 			TMyersFinder maxPos;
387 			while (find(myersFinder, myersPattern, minScore))
388 				if (maxScore < getScore(myersPattern))
389 				{
390 					maxScore = getScore(myersPattern);
391 					maxPos = myersFinder;
392 				}
393 
394 			if (maxScore >= minScore)
395 			{
396 				TGenomeInfixRev		infRev(infix(currentSource, start1, start1+position(maxPos)+1));
397 				TReadRev		readRev(read);
398 				TMyersFinderRev		myersFinderRev(infRev);
399 				TMyersPatternRev	myersPatternRev(readRev);
400 				// find beginning of best semi-global alignment
401 				if (find(myersFinderRev, myersPatternRev, maxScore))
402 					start1 = start1 + position(maxPos) - (position(myersFinderRev) + 1);
403 				else {
404 					// this case should never occur
405 					if(revComp) std::cerr <<"reverse\n";
406 					std::cerr << "posMaxpos = " << position(maxPos) << std::endl;
407 					std::cerr << "startPos = " << startPos << std::endl;
408 					std::cerr << "maxEnd   = " << maxEnd << std::endl;
409 					std::cerr << "HUH?\n" << std::endl;
410 					std::cerr << "fGENOME: " << host(myersFinder) << std::endl;
411 					std::cerr << "fREAD:   " << read << std::endl;
412 					std::cerr << "iGENOME: " << infix(currentSource, start1,start1+position(maxPos)+1) << std::endl;
413 					std::cerr << "rGENOME: " << infRev << std::endl;
414 					std::cerr << "rREAD:   " << readRev << std::endl;
415 				}
416 
417 			}
418 			if(revComp) reverseComplement(read);
419 			SEQAN_ASSERT(maxScore >= -(int)countErrors);
420 			if(maxScore != -(int)countErrors)
421 				kickOut = true;*/
422 			if(!kickOut)
423 			{
424 				std::stringstream id;
425 				resize(read,trueLength);
426 				++bucketCounter[countErrors];
427 				++readCounter;
428 				if(verbose && readCounter%10000 == 0)std::cout << readCounter<<"..." << std::flush;
429 				//Add read to readSet
430 				if(!revComp) id << startPos << ',' << startPos+pos;
431 				else id << maxEnd << ',' << maxEnd-pos;
432 				id << "[id=" << readCounter << ",fragId=" << readCounter % realNumReads;
433 				id << ",repeatId=" << 0 <<",errors=" << countErrors;
434 				if (revComp) id << ",orientation=R]";
435 				else id << ",orientation=F]";
436 
437 				appendValue(readIDs, id.str(),Generous());
438 				appendValue(readSet, read, Generous());
439 			}
440 			else ++kickOutCount[countErrors];
441 
442 		}
443 		++samplePosCounter;
444 //		else std::cout << "Not successful\n";
445 	}
446 //	if (simulateMatePairs == 1) {
447 //		print(bucketCounter / (2* numOfReads))
448 //	} else {
449 	if (verbose)
450 	{
451 		std::cout << "\n\nBucket frequencies:\n";
452 		for(unsigned i = 0; i < length(bucketCounter); ++i)
453 			std::cout << (double) bucketCounter[i] / numReads << std::endl;
454 		std::cout << std::endl;
455 		std::cout << "\nBucket kickout count:\n";
456 		for(unsigned i = 0; i < length(kickOutCount); ++i)
457 		{
458 			if((kickOutCount[i] + bucketCounter[i]) > 0) std::cout << (double) kickOutCount[i] / (kickOutCount[i] + bucketCounter[i]) << std::endl;
459 			else std::cout << "0\n";
460 		}
461 		std::cout << std::endl;
462 
463 		std::cout << "\nInvalid modification pattern count: "<<inValidModPat<<std::endl;
464 	}
465 
466 //	if (simulateMatePairs == 1) {
467 //		write(scan(tmpPath, what = 'character'), file=readPath, sep=std::endl, append = TRUE)
468 //		unlink(tmpPath)
469 //	}
470 }
471 
472 }
473 
474 #endif
475