1  /*==========================================================================
2              RazerS - Fast Read Mapping with Controlled Loss Rate
3                    http://www.seqan.de/projects/razers.html
4 
5  ============================================================================
6   Copyright (C) 2008 by Anne-Katrin Emde
7 
8   This program is free software; you can redistribute it and/or
9   modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 3 of the License, or (at your options) any later version.
12 
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  ==========================================================================*/
21 
22 #ifndef SEQAN_HEADER_READ_SIMULATOR_H
23 #define SEQAN_HEADER_READ_SIMULATOR_H
24 
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <algorithm>
29 
30 #include <seqan/sequence.h>
31 #include <seqan/find.h>
32 #include <seqan/modifier.h>
33 
34 
35 namespace SEQAN_NAMESPACE_MAIN
36 {
37 
38 #define TEMP_RAND_MAX (RAND_MAX + 1.0)
39 
40 template <typename TOperation, typename TAlphabet>
41 inline TAlphabet
sample(TOperation m,TAlphabet base)42 sample(TOperation m, TAlphabet base)
43 {
44 	Dna ret;
45 	do
46 		ret = (Dna)(int)((double)ValueSize<Dna>::VALUE * (double)rand() / TEMP_RAND_MAX);
47 	while (ret == base && m == SEQAN_MISMATCH);
48 	return ret;
49 }
50 
51 template < typename TGenome >
simulateGenome(TGenome & genome,int size)52 void simulateGenome(TGenome &genome, int size)
53 {
54 //	mtRandInit();
55 	resize(genome, size);
56 	for(int i = 0; i < size; ++i)
57 		genome[i] = sample(0, (Dna)0);
58 }
59 
60 
61 template<typename TPosString>
62 void
fillupStartpos(TPosString & sortedStartPos,int numReads,int readLength,int maxErrors,int libSize,int libError,int seqLength,double forwardProb)63 fillupStartpos(TPosString & sortedStartPos,
64 	int numReads,
65 	int readLength,
66 	int maxErrors,			// how many errors they may have
67 	int libSize,			// library size, 0 disables mate-pair simulation
68 	int libError,
69 	int seqLength,			// maximal library size deviation
70 	double forwardProb)
71 {
72 
73 	const int REVCOMP = 1 << (sizeof(int)*8-1);
74 	resize(sortedStartPos, numReads);
75 
76 	int fragmentSize = readLength;
77 	if (libSize > 0)
78 		fragmentSize = libSize + libError;
79 
80 
81 	// sample positions
82 	for (int i = 0; i < numReads; ++i)
83 		sortedStartPos[i] = (int)((seqLength - fragmentSize - maxErrors + 1.0) * rand() / TEMP_RAND_MAX);
84 
85 	std::sort(begin(sortedStartPos),end(sortedStartPos));
86 
87 	// sample orientations
88 	for (int i = 0; i < numReads; ++i)
89 		if ((double)rand() / TEMP_RAND_MAX >= forwardProb)
90 			sortedStartPos[i] |= REVCOMP;
91 
92 	if (libSize > 0)
93 	{
94 		resize(sortedStartPos,2*numReads);
95 		// sample mate-pair positions and inverse orientations
96 		for(int i=0;i<numReads;++i)
97 		{
98 			int leftPos = sortedStartPos[i] & ~REVCOMP;
99 			int lSize = fragmentSize - (int)((2.0 * libError + 1.0) * (double)rand() / TEMP_RAND_MAX);
100 			int rightPos = leftPos + lSize - readLength;
101 			if ((sortedStartPos[i] & REVCOMP) == 0)
102 			{
103 				sortedStartPos[i+numReads] = rightPos | REVCOMP;
104 			}
105 			else
106 			{
107 				sortedStartPos[i] = rightPos | REVCOMP;
108 				sortedStartPos[i+numReads] = leftPos;
109 			}
110 		}
111 		numReads*=2;
112 	}
113 
114 
115 }
116 
117 
118 
119 //////////
120 // Simulates a set of reads from a set of haplotypes with a certain error distribution
121 //////////
122 template <
123 	typename TReadSet,
124 	typename TReadIDs,
125 	typename TGenomeSet,
126 	typename TDistr >
127 void simulateReads(
128 	TReadSet &readSet,		// generated read sequences
129 	TReadIDs &readIDs,		// corresponding Fasta ids
130 	TGenomeSet &genomeSet,	// source genome sequences
131 	int numReads,			// how many reads should be generated
132 	int maxErrors,			// how many errors they may have
133 	TDistr &errorDist,		// error probability distribution
134 	int libSize,			// library size, 0 disables mate-pair simulation
135 	int libError,			// maximal library size deviation
136 	double forwardProb,
137 	bool verbose = false)
138 {
139 	typedef typename Value<TReadSet>::Type				TRead;
140 	typedef typename Value<TGenomeSet>::Type			TGenome;
141 	typedef typename Infix<TGenome>::Type				TGenomeInfix;
142 	typedef ModifiedString<TGenomeInfix, ModReverse>	TGenomeInfixRev;
143 	typedef ModifiedString<TRead, ModReverse>			TReadRev;
144 
145 	typedef Finder<TGenomeInfix>						TMyersFinder;
146 	typedef Pattern<TRead, MyersUkkonen>				TMyersPattern;
147 
148 //	typedef Finder<TGenomeInfix>						TMyersFinderRev;
149 	typedef Finder<TGenomeInfixRev>						TMyersFinderRev;
150 	typedef Pattern<TReadRev, MyersUkkonenGlobal>		TMyersPatternRev;
151 
152 //	mtRandInit();
153 //	typedef TGenome TRevComp;
154 
155 	int readLength = length(errorDist)/4;
156 	const int REVCOMP = 1 << (sizeof(int)*8-1);
157 	//int KJ = 2*maxErrors;
158 
159 	srand ( time(NULL) );
160 
161 	String<int> bucketCounter;
162 	resize(bucketCounter,maxErrors,0);
163 
164 	String<int> kickOutCount;
165 	resize(kickOutCount,maxErrors,0);
166 
167 	if (verbose)
168 		std::cout << "\nSimulating...";
169 
170 	String<int> modificationPattern;
171 	reserve(modificationPattern, readLength + maxErrors);
172 	int inValidModPat = 0;
173 
174 	//at the moment: nur eine source sequenz --> nimm immer genomeSet[0]
175 	TGenome& currentSource = genomeSet[0];
176 	int seqLength = length(currentSource);
177 
178 	int fragmentSize = readLength;
179 	if (libSize > 0)
180 		fragmentSize = libSize + libError;
181 
182 	String<int> sortedStartPos;
183 /*				# Pick a library size
184 				currentLibrary = sample(1:(length(librarySizes)), 1)
185 				lSize = round(rnorm(1, mean=librarySizes[currentLibrary], sd=librarySd[currentLibrary]))
186 				if (start < end) {
187 					if (start + libSize <= seqLength) {
188 						startMatePair = start + libSize - readLength + 1
189 						endMatePair = startMatePair + readLength - 1
190 						readMate = sourceSeq[startMatePair:endMatePair]
191 						readMate = reverseComplement(readMate)
192 						tmp = startMatePair
193 						startMatePair = endMatePair
194 						endMatePair = tmp
195 						invalidMatePair = 0
196 					}
197 				} else {
198 					if (start - libSize >= 1) {
199 						startMatePair = start - libSize
200 						endMatePair = startMatePair + readLength - 1
201 						readMate = sourceSeq[startMatePair:endMatePair]
202 						invalidMatePair = 0
203 					}
204 				}*/
205 
206 	int realNumReads = numReads;
207 	unsigned int samplePosCounter = 0;
208 	int readCounter = 0;
209 	while (readCounter < numReads) {
210 		clear(modificationPattern);
211 		//# Pick a haplotype
212 		//currentHaplotype = sample(1:(length(haplotypes)), 1)
213 
214 		// Sample a read
215 		if(samplePosCounter >= length(sortedStartPos))//get new start positions
216 		{
217 			fillupStartpos(sortedStartPos, numReads, readLength, maxErrors,	libSize, libError, seqLength, forwardProb);
218 			samplePosCounter = 0;
219 		}
220 		int  startPos = sortedStartPos[samplePosCounter] & ~REVCOMP;
221 		bool revComp  = (sortedStartPos[samplePosCounter] & REVCOMP) != 0;
222 		int  maxEnd   = startPos + readLength + maxErrors - 1;
223 
224 		TGenome read;
225 		resize(read,readLength);
226 
227 		TGenome readTemplate;// = infix(currentSource,startPos,maxEnd);
228 		resize(readTemplate,maxEnd-startPos);
229 		arrayCopy(iter(currentSource,startPos), iter(currentSource,maxEnd), begin(readTemplate)); //infix(currentSource,startPos,maxEnd);
230 
231 		if(revComp) reverseComplement(readTemplate);
232 
233 		int lastOp = 0;
234 		int currOp = 0;
235 		// Sequence the reads
236 		int countErrors = 0;
237 		int pos = 0;
238 		int trueLength = 0;
239 		bool successful = false;
240 
241 		while(pos < maxEnd) {
242 			lastOp = currOp;
243 			double prob = (double)rand()/TEMP_RAND_MAX;
244 			//	std::cout << "prob = " << prob << "\t";
245 			int m;
246 			for(m = 0; m < 4; ++m)
247 			{
248 				double modProb = _transformBack(errorDist[m*readLength + trueLength]);
249 				if (prob < modProb)
250 				{
251 					currOp = m;
252 					break;
253 				}
254 				prob -= modProb;
255 			}
256 			if (m == 4) std::cout << "HUH?";
257 		//	std::cout << "operation = " << operation << "\t";
258 			if(pos==0 &&  currOp == SEQAN_INSERT)// (currOp==SEQAN_DELETE || currOp == SEQAN_INSERT))
259 			{
260 				currOp = 0;
261 				continue;
262 			}
263 			appendValue(modificationPattern,currOp,Generous());
264 
265 			// ignore reads with Ns
266 			if (currOp != SEQAN_INSERT && readTemplate[pos] == 'N')
267 				countErrors = maxErrors + 1;
268 
269 			// Insert Delete is the same as Delete Insert and both are the same as Mismatch (or match)
270 			if(currOp == SEQAN_MATCH) read[trueLength] = readTemplate[pos];
271 			else
272 			{
273 				++countErrors;
274 				if(currOp != SEQAN_INSERT)
275 					read[trueLength] = sample(currOp,readTemplate[pos]);
276 			}
277 			if(currOp != SEQAN_INSERT) ++trueLength; //if read nucleotide is not deleted
278 			if(currOp != SEQAN_DELETE) ++pos; //if read nucleotide is not an insert
279 //			if((lastOp==SEQAN_DELETE && currOp==SEQAN_INSERT) || (currOp==SEQAN_DELETE && lastOp==SEQAN_INSERT))
280 //			{
281 //				--countErrors;
282 //				currOp = SEQAN_MISMATCH;
283 //			//	std::cout << "ID=DI=M\n";
284 //			}
285 		//	std::cout << "true len = " << trueLength << std::endl;
286 			if(trueLength == readLength || countErrors >= maxErrors)
287 			{
288 				if(countErrors < maxErrors)// && currOp != SEQAN_INSERT)
289 					successful = true;
290 				break;
291 			}
292 		}
293 
294 	/*
295 		if(successful)
296 		{
297 			int patLen = length(modificationPattern);
298 			int err = 0, del = 0;
299 			for (int j = 0; j < KJ; ++j)
300 			{
301 				switch (modificationPattern[j]) {
302 					case SEQAN_MATCH:
303 						++del;
304 						break;
305 
306 					case SEQAN_DELETE:
307 						++del;
308 
309 					case SEQAN_INSERT:
310 						++err;
311 						break;
312 
313 					default:;
314 				}
315 				if (del > 0 && del <= err)
316 				{
317 					successful = false;
318 					++inValidModPat;
319 					break;
320 				}
321 			}
322 			if(successful)
323 			{
324 				err = del = 0;
325 				for (int j = patLen - 1; j >= patLen - KJ; --j)
326 				{
327 					switch (modificationPattern[j]) {
328 						case SEQAN_MATCH:
329 							++del;
330 							break;
331 
332 						case SEQAN_DELETE:
333 							++del;
334 
335 						case SEQAN_INSERT:
336 							++err;
337 							break;
338 
339 						default:;
340 					}
341 					if (del > 0 && del <= err)
342 					{
343 						successful = false;
344 						++inValidModPat;
345 						break;
346 					}
347 				}
348 			}
349 		}
350 		*/
351 
352 /*		countMateErrors = 0
353 		if (simulateMatePairs == 1) {
354 			for(pos in 1:length(readMate)) {
355 				if (runif(1) <= _transformBack(errorDist[pos])) {
356 					readMate[pos] = sample(alphabet[alphabet!= readMate[pos]], 1)
357 					countMateErrors = countMateErrors + 1
358 				}
359 			}
360 		}*/
361 
362 		if(successful)
363 		{
364 			//verify that number of errors is correct
365 			bool kickOut = false;
366 /*			int start1 = startPos;
367 			int maxEnd1 = maxEnd;
368 			while(start1 > 0 && (startPos - start1) < countErrors) --start1;
369 			while(maxEnd1 > 0 && (maxEnd1 - maxEnd) < countErrors) ++maxEnd1;
370 			TGenomeInfix genomeInfix(currentSource,start1,maxEnd1);
371 			TMyersFinder myersFinder(genomeInfix);
372 
373 			// init forward verifiers
374 			if(revComp) reverseComplement(read);
375 			TMyersPattern forwardPattern(read);
376 			TMyersPattern &myersPattern = forwardPattern;
377 
378 			// find end of best semi-global alignment
379 			int maxScore = MinValue<int>::VALUE;
380 			int minScore = -(int)countErrors;
381 			TMyersFinder maxPos;
382 			while (find(myersFinder, myersPattern, minScore))
383 				if (maxScore < getScore(myersPattern))
384 				{
385 					maxScore = getScore(myersPattern);
386 					maxPos = myersFinder;
387 				}
388 
389 			if (maxScore >= minScore)
390 			{
391 				TGenomeInfixRev		infRev(infix(currentSource, start1, start1+position(maxPos)+1));
392 				TReadRev		readRev(read);
393 				TMyersFinderRev		myersFinderRev(infRev);
394 				TMyersPatternRev	myersPatternRev(readRev);
395 				// find beginning of best semi-global alignment
396 				if (find(myersFinderRev, myersPatternRev, maxScore))
397 					start1 = start1 + position(maxPos) - (position(myersFinderRev) + 1);
398 				else {
399 					// this case should never occur
400 					if(revComp) std::cerr <<"reverse\n";
401 					std::cerr << "posMaxpos = " << position(maxPos) << std::endl;
402 					std::cerr << "startPos = " << startPos << std::endl;
403 					std::cerr << "maxEnd   = " << maxEnd << std::endl;
404 					std::cerr << "HUH?\n" << std::endl;
405 					std::cerr << "fGENOME: " << host(myersFinder) << std::endl;
406 					std::cerr << "fREAD:   " << read << std::endl;
407 					std::cerr << "iGENOME: " << infix(currentSource, start1,start1+position(maxPos)+1) << std::endl;
408 					std::cerr << "rGENOME: " << infRev << std::endl;
409 					std::cerr << "rREAD:   " << readRev << std::endl;
410 				}
411 
412 			}
413 			if(revComp) reverseComplement(read);
414 			SEQAN_ASSERT_TRUE(maxScore >= -(int)countErrors);
415 			if(maxScore != -(int)countErrors)
416 				kickOut = true;*/
417 			if(!kickOut)
418 			{
419 				std::stringstream id;
420 				resize(read,trueLength);
421 				++bucketCounter[countErrors];
422 				++readCounter;
423 				if(verbose && readCounter%10000 == 0)std::cout << readCounter<<"..." << std::flush;
424 				//Add read to readSet
425 				if(!revComp) id << startPos << ',' << startPos+pos;
426 				else id << maxEnd << ',' << maxEnd-pos;
427 				id << "[id=" << readCounter << ",fragId=" << readCounter % realNumReads;
428 				id << ",repeatId=" << 0 <<",errors=" << countErrors;
429 				if (revComp) id << ",orientation=R]";
430 				else id << ",orientation=F]";
431 
432 				appendValue(readIDs, id.str(),Generous());
433 				appendValue(readSet, read, Generous());
434 			}
435 			else ++kickOutCount[countErrors];
436 
437 		}
438 		++samplePosCounter;
439 //		else std::cout << "Not successful\n";
440 	}
441 //	if (simulateMatePairs == 1) {
442 //		print(bucketCounter / (2* numOfReads))
443 //	} else {
444 	if (verbose)
445 	{
446 		std::cout << "\n\nBucket frequencies:\n";
447 		for(unsigned i = 0; i < length(bucketCounter); ++i)
448 			std::cout << (double) bucketCounter[i] / numReads << std::endl;
449 		std::cout << std::endl;
450 		std::cout << "\nBucket kickout count:\n";
451 		for(unsigned i = 0; i < length(kickOutCount); ++i)
452 		{
453 			if((kickOutCount[i] + bucketCounter[i]) > 0) std::cout << (double) kickOutCount[i] / (kickOutCount[i] + bucketCounter[i]) << std::endl;
454 			else std::cout << "0\n";
455 		}
456 		std::cout << std::endl;
457 
458 		std::cout << "\nInvalid modification pattern count: "<<inValidModPat<<std::endl;
459 	}
460 
461 //	if (simulateMatePairs == 1) {
462 //		write(scan(tmpPath, what = 'character'), file=readPath, sep=std::endl, append = TRUE)
463 //		unlink(tmpPath)
464 //	}
465 }
466 
467 }
468 
469 #endif
470