1  /*==========================================================================
2              RazerS - Fast Read Mapping with Controlled Loss Rate
3                    http://www.seqan.de/projects/razers.html
4 
5  ============================================================================
6   Copyright (C) 2008 by Anne-Katrin Emde
7 
8   This program is free software; you can redistribute it and/or
9   modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 3 of the License, or (at your options) any later version.
12 
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  ==========================================================================*/
21 
22 #ifndef SEQAN_HEADER_READ_SIMULATOR_H
23 #define SEQAN_HEADER_READ_SIMULATOR_H
24 
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <algorithm>
29 
30 #include <seqan/sequence.h>
31 #include <seqan/find.h>
32 #include <seqan/modifier.h>
33 
34 
35 namespace seqan
36 {
37 
38 #define TEMP_RAND_MAX (RAND_MAX + 1.0)
39 
40 template <typename TOperation, typename TAlphabet>
41 inline TAlphabet
sample(TOperation m,TAlphabet base)42 sample(TOperation m, TAlphabet base)
43 {
44 	Dna ret;
45 	do
46 		ret = (Dna)(int)((double)ValueSize<Dna>::VALUE * (double)rand() / TEMP_RAND_MAX);
47 	while (ret == base && m == SEQAN_MISMATCH);
48 	return ret;
49 }
50 
51 template < typename TGenome >
simulateGenome(TGenome & genome,int size)52 void simulateGenome(TGenome &genome, int size)
53 {
54 //	mtRandInit();
55 	resize(genome, size);
56 	for(int i = 0; i < size; ++i)
57 		genome[i] = sample(0, (Dna)0);
58 }
59 
60 
61 template<typename TPosString>
62 void
fillupStartpos(TPosString & sortedStartPos,int numReads,int readLength,int maxErrors,int libSize,int libError,int seqLength,double forwardProb)63 fillupStartpos(TPosString & sortedStartPos,
64 	int numReads,
65 	int readLength,
66 	int maxErrors,			// how many errors they may have
67 	int libSize,			// library size, 0 disables mate-pair simulation
68 	int libError,
69 	int seqLength,			// maximal library size deviation
70 	double forwardProb)
71 {
72 
73 	const int REVCOMP = 1 << (sizeof(int)*8-1);
74 	resize(sortedStartPos, numReads);
75 
76 	int fragmentSize = readLength;
77 	if (libSize > 0)
78 		fragmentSize = libSize + libError;
79 
80 
81 	// sample positions
82 	for (int i = 0; i < numReads; ++i)
83 		sortedStartPos[i] = (int)((seqLength - fragmentSize - maxErrors + 1.0) * rand() / TEMP_RAND_MAX);
84 
85 	std::sort(begin(sortedStartPos),end(sortedStartPos));
86 
87 	// sample orientations
88 	for (int i = 0; i < numReads; ++i)
89 		if ((double)rand() / TEMP_RAND_MAX >= forwardProb)
90 			sortedStartPos[i] |= REVCOMP;
91 
92 	if (libSize > 0)
93 	{
94 		resize(sortedStartPos,2*numReads);
95 		// sample mate-pair positions and inverse orientations
96 		for(int i=0;i<numReads;++i)
97 		{
98 			int leftPos = sortedStartPos[i] & ~REVCOMP;
99 			int lSize = fragmentSize - (int)((2.0 * libError + 1.0) * (double)rand() / TEMP_RAND_MAX);
100 			int rightPos = leftPos + lSize - readLength;
101 			if ((sortedStartPos[i] & REVCOMP) == 0)
102 			{
103 				sortedStartPos[i+numReads] = rightPos | REVCOMP;
104 			}
105 			else
106 			{
107 				sortedStartPos[i] = rightPos | REVCOMP;
108 				sortedStartPos[i+numReads] = leftPos;
109 			}
110 		}
111 		numReads*=2;
112 	}
113 
114 
115 }
116 
117 
118 
119 //////////
120 // Simulates a set of reads from a set of haplotypes with a certain error distribution
121 //////////
122 template <
123 	typename TReadSet,
124 	typename TReadIDs,
125 	typename TGenomeSet,
126 	typename TDistr >
127 void simulateReads(
128 	TReadSet &readSet,		// generated read sequences
129 	TReadIDs &readIDs,		// corresponding Fasta ids
130 	TGenomeSet &genomeSet,	// source genome sequences
131 	int numReads,			// how many reads should be generated
132 	int maxErrors,			// how many errors they may have
133 	TDistr &errorDist,		// error probability distribution
134 	int libSize,			// library size, 0 disables mate-pair simulation
135 	int libError,			// maximal library size deviation
136 	double forwardProb,
137 	bool verbose = false)
138 {
139 	//typedef typename Value<TReadSet>::Type				TRead;
140 	//typedef typename Value<TGenomeSet>::Type			TGenome;
141 	//typedef typename Infix<TGenome>::Type				TGenomeInfix;
142 	//typedef ModifiedString<TGenomeInfix, ModReverse>	TGenomeInfixRev;
143 	//typedef ModifiedString<TRead, ModReverse>			TReadRev;
144 
145 	//typedef Finder<TGenomeInfix>						TMyersFinder;
146 	//typedef Pattern<TRead, MyersUkkonen>				TMyersPattern;
147 
148 //	typedef Finder<TGenomeInfix>						TMyersFinderRev;
149 	//typedef Finder<TGenomeInfixRev>						TMyersFinderRev;
150 	//typedef Pattern<TReadRev, MyersUkkonenGlobal>		TMyersPatternRev;
151 
152 //	mtRandInit();
153 //	typedef TGenome TRevComp;
154 
155 	int readLength = length(errorDist)/4;
156 	const int REVCOMP = 1 << (sizeof(int)*8-1);
157 	//int KJ = 2*maxErrors;
158 
159 	srand ( time(NULL) );
160 
161 	String<int> bucketCounter;
162 	resize(bucketCounter,maxErrors,0);
163 
164 	String<int> kickOutCount;
165 	resize(kickOutCount,maxErrors,0);
166 
167 	if (verbose)
168 		std::cout << "\nSimulating...";
169 
170 	String<int> modificationPattern;
171 	reserve(modificationPattern, readLength + maxErrors);
172 	int inValidModPat = 0;
173 
174 	//at the moment: nur eine source sequenz --> nimm immer genomeSet[0]
175 	TGenome& currentSource = genomeSet[0];
176 	int seqLength = length(currentSource);
177 
178 	int fragmentSize = readLength;
179 	if (libSize > 0)
180 		fragmentSize = libSize + libError;
181 
182 	String<int> sortedStartPos;
183 /*				# Pick a library size
184 				currentLibrary = sample(1:(length(librarySizes)), 1)
185 				lSize = round(rnorm(1, mean=librarySizes[currentLibrary], sd=librarySd[currentLibrary]))
186 				if (start < end) {
187 					if (start + libSize <= seqLength) {
188 						startMatePair = start + libSize - readLength + 1
189 						endMatePair = startMatePair + readLength - 1
190 						readMate = sourceSeq[startMatePair:endMatePair]
191 						readMate = reverseComplement(readMate)
192 						tmp = startMatePair
193 						startMatePair = endMatePair
194 						endMatePair = tmp
195 						invalidMatePair = 0
196 					}
197 				} else {
198 					if (start - libSize >= 1) {
199 						startMatePair = start - libSize
200 						endMatePair = startMatePair + readLength - 1
201 						readMate = sourceSeq[startMatePair:endMatePair]
202 						invalidMatePair = 0
203 					}
204 				}*/
205 
206 	int realNumReads = numReads;
207 	unsigned int samplePosCounter = 0;
208 	int readCounter = 0;
209 	while (readCounter < numReads) {
210 		clear(modificationPattern);
211 		//# Pick a haplotype
212 		//currentHaplotype = sample(1:(length(haplotypes)), 1)
213 
214 		// Sample a read
215 		if(samplePosCounter >= length(sortedStartPos))//get new start positions
216 		{
217 			fillupStartpos(sortedStartPos, numReads, readLength, maxErrors,	libSize, libError, seqLength, forwardProb);
218 			samplePosCounter = 0;
219 		}
220 		int  startPos = sortedStartPos[samplePosCounter] & ~REVCOMP;
221 		bool revComp  = (sortedStartPos[samplePosCounter] & REVCOMP) != 0;
222 		int  maxEnd   = startPos + readLength + maxErrors - 1;
223 
224 		TGenome read;
225 		resize(read,readLength);
226 
227 		TGenome readTemplate;// = infix(currentSource,startPos,maxEnd);
228 		resize(readTemplate,maxEnd-startPos);
229 		arrayCopy(iter(currentSource,startPos), iter(currentSource,maxEnd), begin(readTemplate)); //infix(currentSource,startPos,maxEnd);
230 
231 		if(revComp) reverseComplement(readTemplate);
232 
233 		int lastOp = 0;
234 		int currOp = 0;
235 		// Sequence the reads
236 		int countErrors = 0;
237 		int pos = 0;
238 		int trueLength = 0;
239 		bool successful = false;
240 
241 		while(pos < maxEnd) {
242 			lastOp = currOp;
243 			double prob = (double)rand()/TEMP_RAND_MAX;
244 			//	std::cout << "prob = " << prob << "\t";
245 			int m;
246 			for(m = 0; m < 4; ++m)
247 			{
248 				double modProb = _transformBack(errorDist[m*readLength + trueLength]);
249 				if (prob < modProb)
250 				{
251 					currOp = m;
252 					break;
253 				}
254 				prob -= modProb;
255 			}
256 			if (m == 4) std::cout << "HUH?";
257 		//	std::cout << "operation = " << operation << "\t";
258 			if(pos==0 &&  currOp == SEQAN_INSERT)// (currOp==SEQAN_DELETE || currOp == SEQAN_INSERT))
259 			{
260 				currOp = 0;
261 				continue;
262 			}
263 			appendValue(modificationPattern,currOp,Generous());
264 
265 			// ignore reads with Ns
266 			if (currOp != SEQAN_INSERT && readTemplate[pos] == 'N')
267 				countErrors = maxErrors + 1;
268 
269 			// Insert Delete is the same as Delete Insert and both are the same as Mismatch (or match)
270 			if(currOp == SEQAN_MATCH) read[trueLength] = readTemplate[pos];
271 			else
272 			{
273 				++countErrors;
274 				if(currOp != SEQAN_INSERT)
275 					read[trueLength] = sample(currOp,readTemplate[pos]);
276 			}
277 			if(currOp != SEQAN_INSERT) ++trueLength; //if read nucleotide is not deleted
278 			if(currOp != SEQAN_DELETE) ++pos; //if read nucleotide is not an insert
279 //			if((lastOp==SEQAN_DELETE && currOp==SEQAN_INSERT) || (currOp==SEQAN_DELETE && lastOp==SEQAN_INSERT))
280 //			{
281 //				--countErrors;
282 //				currOp = SEQAN_MISMATCH;
283 //			//	std::cout << "ID=DI=M\n";
284 //			}
285 		//	std::cout << "true len = " << trueLength << std::endl;
286 			if(trueLength == readLength || countErrors >= maxErrors)
287 			{
288 				if(countErrors < maxErrors)// && currOp != SEQAN_INSERT)
289 					successful = true;
290 				break;
291 			}
292 		}
293 
294 	/*
295 		if(successful)
296 		{
297 			int patLen = length(modificationPattern);
298 			int err = 0, del = 0;
299 			for (int j = 0; j < KJ; ++j)
300 			{
301 				switch (modificationPattern[j]) {
302 					case SEQAN_MATCH:
303 						++del;
304 						break;
305 
306 					case SEQAN_DELETE:
307 						++del;
308 						SEQAN_FALLTHROUGH
309 
310 					case SEQAN_INSERT:
311 						++err;
312 						break;
313 
314 					default:;
315 				}
316 				if (del > 0 && del <= err)
317 				{
318 					successful = false;
319 					++inValidModPat;
320 					break;
321 				}
322 			}
323 			if(successful)
324 			{
325 				err = del = 0;
326 				for (int j = patLen - 1; j >= patLen - KJ; --j)
327 				{
328 					switch (modificationPattern[j]) {
329 						case SEQAN_MATCH:
330 							++del;
331 							break;
332 
333 						case SEQAN_DELETE:
334 							++del;
335 							SEQAN_FALLTHROUGH
336 
337 						case SEQAN_INSERT:
338 							++err;
339 							break;
340 
341 						default:;
342 					}
343 					if (del > 0 && del <= err)
344 					{
345 						successful = false;
346 						++inValidModPat;
347 						break;
348 					}
349 				}
350 			}
351 		}
352 		*/
353 
354 /*		countMateErrors = 0
355 		if (simulateMatePairs == 1) {
356 			for(pos in 1:length(readMate)) {
357 				if (runif(1) <= _transformBack(errorDist[pos])) {
358 					readMate[pos] = sample(alphabet[alphabet!= readMate[pos]], 1)
359 					countMateErrors = countMateErrors + 1
360 				}
361 			}
362 		}*/
363 
364 		if(successful)
365 		{
366 			//verify that number of errors is correct
367 			bool kickOut = false;
368 /*			int start1 = startPos;
369 			int maxEnd1 = maxEnd;
370 			while(start1 > 0 && (startPos - start1) < countErrors) --start1;
371 			while(maxEnd1 > 0 && (maxEnd1 - maxEnd) < countErrors) ++maxEnd1;
372 			TGenomeInfix genomeInfix(currentSource,start1,maxEnd1);
373 			TMyersFinder myersFinder(genomeInfix);
374 
375 			// init forward verifiers
376 			if(revComp) reverseComplement(read);
377 			TMyersPattern forwardPattern(read);
378 			TMyersPattern &myersPattern = forwardPattern;
379 
380 			// find end of best semi-global alignment
381 			int maxScore = std::numeric_limits<int>::min();
382 			int minScore = -(int)countErrors;
383 			TMyersFinder maxPos;
384 			while (find(myersFinder, myersPattern, minScore))
385 				if (maxScore < getScore(myersPattern))
386 				{
387 					maxScore = getScore(myersPattern);
388 					maxPos = myersFinder;
389 				}
390 
391 			if (maxScore >= minScore)
392 			{
393 				TGenomeInfixRev		infRev(infix(currentSource, start1, start1+position(maxPos)+1));
394 				TReadRev		readRev(read);
395 				TMyersFinderRev		myersFinderRev(infRev);
396 				TMyersPatternRev	myersPatternRev(readRev);
397 				// find beginning of best semi-global alignment
398 				if (find(myersFinderRev, myersPatternRev, maxScore))
399 					start1 = start1 + position(maxPos) - (position(myersFinderRev) + 1);
400 				else {
401 					// this case should never occur
402 					if(revComp) std::cerr <<"reverse\n";
403 					std::cerr << "posMaxpos = " << position(maxPos) << std::endl;
404 					std::cerr << "startPos = " << startPos << std::endl;
405 					std::cerr << "maxEnd   = " << maxEnd << std::endl;
406 					std::cerr << "HUH?\n" << std::endl;
407 					std::cerr << "fGENOME: " << host(myersFinder) << std::endl;
408 					std::cerr << "fREAD:   " << read << std::endl;
409 					std::cerr << "iGENOME: " << infix(currentSource, start1,start1+position(maxPos)+1) << std::endl;
410 					std::cerr << "rGENOME: " << infRev << std::endl;
411 					std::cerr << "rREAD:   " << readRev << std::endl;
412 				}
413 
414 			}
415 			if(revComp) reverseComplement(read);
416 			SEQAN_ASSERT(maxScore >= -(int)countErrors);
417 			if(maxScore != -(int)countErrors)
418 				kickOut = true;*/
419 			if(!kickOut)
420 			{
421 				std::stringstream id;
422 				resize(read,trueLength);
423 				++bucketCounter[countErrors];
424 				++readCounter;
425 				if(verbose && readCounter%10000 == 0)std::cout << readCounter<<"..." << std::flush;
426 				//Add read to readSet
427 				if(!revComp) id << startPos << ',' << startPos+pos;
428 				else id << maxEnd << ',' << maxEnd-pos;
429 				id << "[id=" << readCounter << ",fragId=" << readCounter % realNumReads;
430 				id << ",repeatId=" << 0 <<",errors=" << countErrors;
431 				if (revComp) id << ",orientation=R]";
432 				else id << ",orientation=F]";
433 
434 				appendValue(readIDs, id.str(),Generous());
435 				appendValue(readSet, read, Generous());
436 			}
437 			else ++kickOutCount[countErrors];
438 
439 		}
440 		++samplePosCounter;
441 //		else std::cout << "Not successful\n";
442 	}
443 //	if (simulateMatePairs == 1) {
444 //		print(bucketCounter / (2* numOfReads))
445 //	} else {
446 	if (verbose)
447 	{
448 		std::cout << "\n\nBucket frequencies:\n";
449 		for(unsigned i = 0; i < length(bucketCounter); ++i)
450 			std::cout << (double) bucketCounter[i] / numReads << std::endl;
451 		std::cout << std::endl;
452 		std::cout << "\nBucket kickout count:\n";
453 		for(unsigned i = 0; i < length(kickOutCount); ++i)
454 		{
455 			if((kickOutCount[i] + bucketCounter[i]) > 0) std::cout << (double) kickOutCount[i] / (kickOutCount[i] + bucketCounter[i]) << std::endl;
456 			else std::cout << "0\n";
457 		}
458 		std::cout << std::endl;
459 
460 		std::cout << "\nInvalid modification pattern count: "<<inValidModPat<<std::endl;
461 	}
462 
463 //	if (simulateMatePairs == 1) {
464 //		write(scan(tmpPath, what = 'character'), file=readPath, sep=std::endl, append = TRUE)
465 //		unlink(tmpPath)
466 //	}
467 }
468 
469 }
470 
471 #endif
472