1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by Anne-Katrin Emde
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_READ_SIMULATOR_H
23 #define SEQAN_HEADER_READ_SIMULATOR_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <algorithm>
29
30 #include <seqan/sequence.h>
31 #include <seqan/find.h>
32 #include <seqan/modifier.h>
33
34
35 namespace SEQAN_NAMESPACE_MAIN
36 {
37
38 #define TEMP_RAND_MAX (RAND_MAX + 1.0)
39
40 template <typename TOperation, typename TAlphabet>
41 inline TAlphabet
sample(TOperation m,TAlphabet base)42 sample(TOperation m, TAlphabet base)
43 {
44 Dna ret;
45 do
46 ret = (Dna)(int)((double)ValueSize<Dna>::VALUE * (double)rand() / TEMP_RAND_MAX);
47 while (ret == base && m == SEQAN_MISMATCH);
48 return ret;
49 }
50
51 template < typename TGenome >
simulateGenome(TGenome & genome,int size)52 void simulateGenome(TGenome &genome, int size)
53 {
54 // mtRandInit();
55 resize(genome, size);
56 for(int i = 0; i < size; ++i)
57 genome[i] = sample(0, (Dna)0);
58 }
59
60
61 template<typename TPosString>
62 void
fillupStartpos(TPosString & sortedStartPos,int numReads,int readLength,int maxErrors,int libSize,int libError,int seqLength,double forwardProb)63 fillupStartpos(TPosString & sortedStartPos,
64 int numReads,
65 int readLength,
66 int maxErrors, // how many errors they may have
67 int libSize, // library size, 0 disables mate-pair simulation
68 int libError,
69 int seqLength, // maximal library size deviation
70 double forwardProb)
71 {
72
73 const int REVCOMP = 1 << (sizeof(int)*8-1);
74 resize(sortedStartPos, numReads);
75
76 int fragmentSize = readLength;
77 if (libSize > 0)
78 fragmentSize = libSize + libError;
79
80
81 // sample positions
82 for (int i = 0; i < numReads; ++i)
83 sortedStartPos[i] = (int)((seqLength - fragmentSize - maxErrors + 1.0) * rand() / TEMP_RAND_MAX);
84
85 std::sort(begin(sortedStartPos),end(sortedStartPos));
86
87 // sample orientations
88 for (int i = 0; i < numReads; ++i)
89 if ((double)rand() / TEMP_RAND_MAX >= forwardProb)
90 sortedStartPos[i] |= REVCOMP;
91
92 if (libSize > 0)
93 {
94 resize(sortedStartPos,2*numReads);
95 // sample mate-pair positions and inverse orientations
96 for(int i=0;i<numReads;++i)
97 {
98 int leftPos = sortedStartPos[i] & ~REVCOMP;
99 int lSize = fragmentSize - (int)((2.0 * libError + 1.0) * (double)rand() / TEMP_RAND_MAX);
100 int rightPos = leftPos + lSize - readLength;
101 if ((sortedStartPos[i] & REVCOMP) == 0)
102 {
103 sortedStartPos[i+numReads] = rightPos | REVCOMP;
104 }
105 else
106 {
107 sortedStartPos[i] = rightPos | REVCOMP;
108 sortedStartPos[i+numReads] = leftPos;
109 }
110 }
111 numReads*=2;
112 }
113
114
115 }
116
117
118
119 //////////
120 // Simulates a set of reads from a set of haplotypes with a certain error distribution
121 //////////
122 template <
123 typename TReadSet,
124 typename TReadIDs,
125 typename TGenomeSet,
126 typename TDistr >
127 void simulateReads(
128 TReadSet &readSet, // generated read sequences
129 TReadIDs &readIDs, // corresponding Fasta ids
130 TGenomeSet &genomeSet, // source genome sequences
131 int numReads, // how many reads should be generated
132 int maxErrors, // how many errors they may have
133 TDistr &errorDist, // error probability distribution
134 int libSize, // library size, 0 disables mate-pair simulation
135 int libError, // maximal library size deviation
136 double forwardProb,
137 bool verbose = false)
138 {
139 typedef typename Value<TReadSet>::Type TRead;
140 typedef typename Value<TGenomeSet>::Type TGenome;
141 typedef typename Infix<TGenome>::Type TGenomeInfix;
142 typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev;
143 typedef ModifiedString<TRead, ModReverse> TReadRev;
144
145 typedef Finder<TGenomeInfix> TMyersFinder;
146 typedef Pattern<TRead, MyersUkkonen> TMyersPattern;
147
148 // typedef Finder<TGenomeInfix> TMyersFinderRev;
149 typedef Finder<TGenomeInfixRev> TMyersFinderRev;
150 typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev;
151
152 // mtRandInit();
153 // typedef TGenome TRevComp;
154
155 int readLength = length(errorDist)/4;
156 const int REVCOMP = 1 << (sizeof(int)*8-1);
157 //int KJ = 2*maxErrors;
158
159 srand ( time(NULL) );
160
161 String<int> bucketCounter;
162 resize(bucketCounter,maxErrors,0);
163
164 String<int> kickOutCount;
165 resize(kickOutCount,maxErrors,0);
166
167 if (verbose)
168 std::cout << "\nSimulating...";
169
170 String<int> modificationPattern;
171 reserve(modificationPattern, readLength + maxErrors);
172 int inValidModPat = 0;
173
174 //at the moment: nur eine source sequenz --> nimm immer genomeSet[0]
175 TGenome& currentSource = genomeSet[0];
176 int seqLength = length(currentSource);
177
178 int fragmentSize = readLength;
179 if (libSize > 0)
180 fragmentSize = libSize + libError;
181
182 String<int> sortedStartPos;
183 /* # Pick a library size
184 currentLibrary = sample(1:(length(librarySizes)), 1)
185 lSize = round(rnorm(1, mean=librarySizes[currentLibrary], sd=librarySd[currentLibrary]))
186 if (start < end) {
187 if (start + libSize <= seqLength) {
188 startMatePair = start + libSize - readLength + 1
189 endMatePair = startMatePair + readLength - 1
190 readMate = sourceSeq[startMatePair:endMatePair]
191 readMate = reverseComplement(readMate)
192 tmp = startMatePair
193 startMatePair = endMatePair
194 endMatePair = tmp
195 invalidMatePair = 0
196 }
197 } else {
198 if (start - libSize >= 1) {
199 startMatePair = start - libSize
200 endMatePair = startMatePair + readLength - 1
201 readMate = sourceSeq[startMatePair:endMatePair]
202 invalidMatePair = 0
203 }
204 }*/
205
206 int realNumReads = numReads;
207 unsigned int samplePosCounter = 0;
208 int readCounter = 0;
209 while (readCounter < numReads) {
210 clear(modificationPattern);
211 //# Pick a haplotype
212 //currentHaplotype = sample(1:(length(haplotypes)), 1)
213
214 // Sample a read
215 if(samplePosCounter >= length(sortedStartPos))//get new start positions
216 {
217 fillupStartpos(sortedStartPos, numReads, readLength, maxErrors, libSize, libError, seqLength, forwardProb);
218 samplePosCounter = 0;
219 }
220 int startPos = sortedStartPos[samplePosCounter] & ~REVCOMP;
221 bool revComp = (sortedStartPos[samplePosCounter] & REVCOMP) != 0;
222 int maxEnd = startPos + readLength + maxErrors - 1;
223
224 TGenome read;
225 resize(read,readLength);
226
227 TGenome readTemplate;// = infix(currentSource,startPos,maxEnd);
228 resize(readTemplate,maxEnd-startPos);
229 arrayCopy(iter(currentSource,startPos), iter(currentSource,maxEnd), begin(readTemplate)); //infix(currentSource,startPos,maxEnd);
230
231 if(revComp) reverseComplement(readTemplate);
232
233 int lastOp = 0;
234 int currOp = 0;
235 // Sequence the reads
236 int countErrors = 0;
237 int pos = 0;
238 int trueLength = 0;
239 bool successful = false;
240
241 while(pos < maxEnd) {
242 lastOp = currOp;
243 double prob = (double)rand()/TEMP_RAND_MAX;
244 // std::cout << "prob = " << prob << "\t";
245 int m;
246 for(m = 0; m < 4; ++m)
247 {
248 double modProb = _transformBack(errorDist[m*readLength + trueLength]);
249 if (prob < modProb)
250 {
251 currOp = m;
252 break;
253 }
254 prob -= modProb;
255 }
256 if (m == 4) std::cout << "HUH?";
257 // std::cout << "operation = " << operation << "\t";
258 if(pos==0 && currOp == SEQAN_INSERT)// (currOp==SEQAN_DELETE || currOp == SEQAN_INSERT))
259 {
260 currOp = 0;
261 continue;
262 }
263 appendValue(modificationPattern,currOp,Generous());
264
265 // ignore reads with Ns
266 if (currOp != SEQAN_INSERT && readTemplate[pos] == 'N')
267 countErrors = maxErrors + 1;
268
269 // Insert Delete is the same as Delete Insert and both are the same as Mismatch (or match)
270 if(currOp == SEQAN_MATCH) read[trueLength] = readTemplate[pos];
271 else
272 {
273 ++countErrors;
274 if(currOp != SEQAN_INSERT)
275 read[trueLength] = sample(currOp,readTemplate[pos]);
276 }
277 if(currOp != SEQAN_INSERT) ++trueLength; //if read nucleotide is not deleted
278 if(currOp != SEQAN_DELETE) ++pos; //if read nucleotide is not an insert
279 // if((lastOp==SEQAN_DELETE && currOp==SEQAN_INSERT) || (currOp==SEQAN_DELETE && lastOp==SEQAN_INSERT))
280 // {
281 // --countErrors;
282 // currOp = SEQAN_MISMATCH;
283 // // std::cout << "ID=DI=M\n";
284 // }
285 // std::cout << "true len = " << trueLength << std::endl;
286 if(trueLength == readLength || countErrors >= maxErrors)
287 {
288 if(countErrors < maxErrors)// && currOp != SEQAN_INSERT)
289 successful = true;
290 break;
291 }
292 }
293
294 /*
295 if(successful)
296 {
297 int patLen = length(modificationPattern);
298 int err = 0, del = 0;
299 for (int j = 0; j < KJ; ++j)
300 {
301 switch (modificationPattern[j]) {
302 case SEQAN_MATCH:
303 ++del;
304 break;
305
306 case SEQAN_DELETE:
307 ++del;
308
309 case SEQAN_INSERT:
310 ++err;
311 break;
312
313 default:;
314 }
315 if (del > 0 && del <= err)
316 {
317 successful = false;
318 ++inValidModPat;
319 break;
320 }
321 }
322 if(successful)
323 {
324 err = del = 0;
325 for (int j = patLen - 1; j >= patLen - KJ; --j)
326 {
327 switch (modificationPattern[j]) {
328 case SEQAN_MATCH:
329 ++del;
330 break;
331
332 case SEQAN_DELETE:
333 ++del;
334
335 case SEQAN_INSERT:
336 ++err;
337 break;
338
339 default:;
340 }
341 if (del > 0 && del <= err)
342 {
343 successful = false;
344 ++inValidModPat;
345 break;
346 }
347 }
348 }
349 }
350 */
351
352 /* countMateErrors = 0
353 if (simulateMatePairs == 1) {
354 for(pos in 1:length(readMate)) {
355 if (runif(1) <= _transformBack(errorDist[pos])) {
356 readMate[pos] = sample(alphabet[alphabet!= readMate[pos]], 1)
357 countMateErrors = countMateErrors + 1
358 }
359 }
360 }*/
361
362 if(successful)
363 {
364 //verify that number of errors is correct
365 bool kickOut = false;
366 /* int start1 = startPos;
367 int maxEnd1 = maxEnd;
368 while(start1 > 0 && (startPos - start1) < countErrors) --start1;
369 while(maxEnd1 > 0 && (maxEnd1 - maxEnd) < countErrors) ++maxEnd1;
370 TGenomeInfix genomeInfix(currentSource,start1,maxEnd1);
371 TMyersFinder myersFinder(genomeInfix);
372
373 // init forward verifiers
374 if(revComp) reverseComplement(read);
375 TMyersPattern forwardPattern(read);
376 TMyersPattern &myersPattern = forwardPattern;
377
378 // find end of best semi-global alignment
379 int maxScore = MinValue<int>::VALUE;
380 int minScore = -(int)countErrors;
381 TMyersFinder maxPos;
382 while (find(myersFinder, myersPattern, minScore))
383 if (maxScore < getScore(myersPattern))
384 {
385 maxScore = getScore(myersPattern);
386 maxPos = myersFinder;
387 }
388
389 if (maxScore >= minScore)
390 {
391 TGenomeInfixRev infRev(infix(currentSource, start1, start1+position(maxPos)+1));
392 TReadRev readRev(read);
393 TMyersFinderRev myersFinderRev(infRev);
394 TMyersPatternRev myersPatternRev(readRev);
395 // find beginning of best semi-global alignment
396 if (find(myersFinderRev, myersPatternRev, maxScore))
397 start1 = start1 + position(maxPos) - (position(myersFinderRev) + 1);
398 else {
399 // this case should never occur
400 if(revComp) std::cerr <<"reverse\n";
401 std::cerr << "posMaxpos = " << position(maxPos) << std::endl;
402 std::cerr << "startPos = " << startPos << std::endl;
403 std::cerr << "maxEnd = " << maxEnd << std::endl;
404 std::cerr << "HUH?\n" << std::endl;
405 std::cerr << "fGENOME: " << host(myersFinder) << std::endl;
406 std::cerr << "fREAD: " << read << std::endl;
407 std::cerr << "iGENOME: " << infix(currentSource, start1,start1+position(maxPos)+1) << std::endl;
408 std::cerr << "rGENOME: " << infRev << std::endl;
409 std::cerr << "rREAD: " << readRev << std::endl;
410 }
411
412 }
413 if(revComp) reverseComplement(read);
414 SEQAN_ASSERT_TRUE(maxScore >= -(int)countErrors);
415 if(maxScore != -(int)countErrors)
416 kickOut = true;*/
417 if(!kickOut)
418 {
419 std::stringstream id;
420 resize(read,trueLength);
421 ++bucketCounter[countErrors];
422 ++readCounter;
423 if(verbose && readCounter%10000 == 0)std::cout << readCounter<<"..." << std::flush;
424 //Add read to readSet
425 if(!revComp) id << startPos << ',' << startPos+pos;
426 else id << maxEnd << ',' << maxEnd-pos;
427 id << "[id=" << readCounter << ",fragId=" << readCounter % realNumReads;
428 id << ",repeatId=" << 0 <<",errors=" << countErrors;
429 if (revComp) id << ",orientation=R]";
430 else id << ",orientation=F]";
431
432 appendValue(readIDs, id.str(),Generous());
433 appendValue(readSet, read, Generous());
434 }
435 else ++kickOutCount[countErrors];
436
437 }
438 ++samplePosCounter;
439 // else std::cout << "Not successful\n";
440 }
441 // if (simulateMatePairs == 1) {
442 // print(bucketCounter / (2* numOfReads))
443 // } else {
444 if (verbose)
445 {
446 std::cout << "\n\nBucket frequencies:\n";
447 for(unsigned i = 0; i < length(bucketCounter); ++i)
448 std::cout << (double) bucketCounter[i] / numReads << std::endl;
449 std::cout << std::endl;
450 std::cout << "\nBucket kickout count:\n";
451 for(unsigned i = 0; i < length(kickOutCount); ++i)
452 {
453 if((kickOutCount[i] + bucketCounter[i]) > 0) std::cout << (double) kickOutCount[i] / (kickOutCount[i] + bucketCounter[i]) << std::endl;
454 else std::cout << "0\n";
455 }
456 std::cout << std::endl;
457
458 std::cout << "\nInvalid modification pattern count: "<<inValidModPat<<std::endl;
459 }
460
461 // if (simulateMatePairs == 1) {
462 // write(scan(tmpPath, what = 'character'), file=readPath, sep=std::endl, append = TRUE)
463 // unlink(tmpPath)
464 // }
465 }
466
467 }
468
469 #endif
470