1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by Anne-Katrin Emde
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_READ_SIMULATOR_H
23 #define SEQAN_HEADER_READ_SIMULATOR_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <algorithm>
29
30 #include <seqan/sequence.h>
31 #include <seqan/find.h>
32 #include <seqan/modifier.h>
33
34
35 namespace seqan
36 {
37
38 #define TEMP_RAND_MAX (RAND_MAX + 1.0)
39
40 template <typename TOperation, typename TAlphabet>
41 inline TAlphabet
sample(TOperation m,TAlphabet base)42 sample(TOperation m, TAlphabet base)
43 {
44 Dna ret;
45 do
46 ret = (Dna)(int)((double)ValueSize<Dna>::VALUE * (double)rand() / TEMP_RAND_MAX);
47 while (ret == base && m == SEQAN_MISMATCH);
48 return ret;
49 }
50
51 template < typename TGenome >
simulateGenome(TGenome & genome,int size)52 void simulateGenome(TGenome &genome, int size)
53 {
54 // mtRandInit();
55 resize(genome, size);
56 for(int i = 0; i < size; ++i)
57 genome[i] = sample(0, (Dna)0);
58 }
59
60
61 template<typename TPosString>
62 void
fillupStartpos(TPosString & sortedStartPos,int numReads,int readLength,int maxErrors,int libSize,int libError,int seqLength,double forwardProb)63 fillupStartpos(TPosString & sortedStartPos,
64 int numReads,
65 int readLength,
66 int maxErrors, // how many errors they may have
67 int libSize, // library size, 0 disables mate-pair simulation
68 int libError,
69 int seqLength, // maximal library size deviation
70 double forwardProb)
71 {
72
73 const int REVCOMP = 1 << (sizeof(int)*8-1);
74 resize(sortedStartPos, numReads);
75
76 int fragmentSize = readLength;
77 if (libSize > 0)
78 fragmentSize = libSize + libError;
79
80
81 // sample positions
82 for (int i = 0; i < numReads; ++i)
83 sortedStartPos[i] = (int)((seqLength - fragmentSize - maxErrors + 1.0) * rand() / TEMP_RAND_MAX);
84
85 std::sort(begin(sortedStartPos),end(sortedStartPos));
86
87 // sample orientations
88 for (int i = 0; i < numReads; ++i)
89 if ((double)rand() / TEMP_RAND_MAX >= forwardProb)
90 sortedStartPos[i] |= REVCOMP;
91
92 if (libSize > 0)
93 {
94 resize(sortedStartPos,2*numReads);
95 // sample mate-pair positions and inverse orientations
96 for(int i=0;i<numReads;++i)
97 {
98 int leftPos = sortedStartPos[i] & ~REVCOMP;
99 int lSize = fragmentSize - (int)((2.0 * libError + 1.0) * (double)rand() / TEMP_RAND_MAX);
100 int rightPos = leftPos + lSize - readLength;
101 if ((sortedStartPos[i] & REVCOMP) == 0)
102 {
103 sortedStartPos[i+numReads] = rightPos | REVCOMP;
104 }
105 else
106 {
107 sortedStartPos[i] = rightPos | REVCOMP;
108 sortedStartPos[i+numReads] = leftPos;
109 }
110 }
111 numReads*=2;
112 }
113
114
115 }
116
117
118
119 //////////
120 // Simulates a set of reads from a set of haplotypes with a certain error distribution
121 //////////
122 template <
123 typename TReadSet,
124 typename TReadIDs,
125 typename TGenomeSet,
126 typename TDistr >
127 void simulateReads(
128 TReadSet &readSet, // generated read sequences
129 TReadIDs &readIDs, // corresponding Fasta ids
130 TGenomeSet &genomeSet, // source genome sequences
131 int numReads, // how many reads should be generated
132 int maxErrors, // how many errors they may have
133 TDistr &errorDist, // error probability distribution
134 int libSize, // library size, 0 disables mate-pair simulation
135 int libError, // maximal library size deviation
136 double forwardProb,
137 bool verbose = false)
138 {
139 //typedef typename Value<TReadSet>::Type TRead;
140 //typedef typename Value<TGenomeSet>::Type TGenome;
141 //typedef typename Infix<TGenome>::Type TGenomeInfix;
142 //typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev;
143 //typedef ModifiedString<TRead, ModReverse> TReadRev;
144
145 //typedef Finder<TGenomeInfix> TMyersFinder;
146 //typedef Pattern<TRead, MyersUkkonen> TMyersPattern;
147
148 // typedef Finder<TGenomeInfix> TMyersFinderRev;
149 //typedef Finder<TGenomeInfixRev> TMyersFinderRev;
150 //typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev;
151
152 // mtRandInit();
153 // typedef TGenome TRevComp;
154
155 int readLength = length(errorDist)/4;
156 const int REVCOMP = 1 << (sizeof(int)*8-1);
157 //int KJ = 2*maxErrors;
158
159 srand ( time(NULL) );
160
161 String<int> bucketCounter;
162 resize(bucketCounter,maxErrors,0);
163
164 String<int> kickOutCount;
165 resize(kickOutCount,maxErrors,0);
166
167 if (verbose)
168 std::cout << "\nSimulating...";
169
170 String<int> modificationPattern;
171 reserve(modificationPattern, readLength + maxErrors);
172 int inValidModPat = 0;
173
174 //at the moment: nur eine source sequenz --> nimm immer genomeSet[0]
175 TGenome& currentSource = genomeSet[0];
176 int seqLength = length(currentSource);
177
178 int fragmentSize = readLength;
179 if (libSize > 0)
180 fragmentSize = libSize + libError;
181
182 String<int> sortedStartPos;
183 /* # Pick a library size
184 currentLibrary = sample(1:(length(librarySizes)), 1)
185 lSize = round(rnorm(1, mean=librarySizes[currentLibrary], sd=librarySd[currentLibrary]))
186 if (start < end) {
187 if (start + libSize <= seqLength) {
188 startMatePair = start + libSize - readLength + 1
189 endMatePair = startMatePair + readLength - 1
190 readMate = sourceSeq[startMatePair:endMatePair]
191 readMate = reverseComplement(readMate)
192 tmp = startMatePair
193 startMatePair = endMatePair
194 endMatePair = tmp
195 invalidMatePair = 0
196 }
197 } else {
198 if (start - libSize >= 1) {
199 startMatePair = start - libSize
200 endMatePair = startMatePair + readLength - 1
201 readMate = sourceSeq[startMatePair:endMatePair]
202 invalidMatePair = 0
203 }
204 }*/
205
206 int realNumReads = numReads;
207 unsigned int samplePosCounter = 0;
208 int readCounter = 0;
209 while (readCounter < numReads) {
210 clear(modificationPattern);
211 //# Pick a haplotype
212 //currentHaplotype = sample(1:(length(haplotypes)), 1)
213
214 // Sample a read
215 if(samplePosCounter >= length(sortedStartPos))//get new start positions
216 {
217 fillupStartpos(sortedStartPos, numReads, readLength, maxErrors, libSize, libError, seqLength, forwardProb);
218 samplePosCounter = 0;
219 }
220 int startPos = sortedStartPos[samplePosCounter] & ~REVCOMP;
221 bool revComp = (sortedStartPos[samplePosCounter] & REVCOMP) != 0;
222 int maxEnd = startPos + readLength + maxErrors - 1;
223
224 TGenome read;
225 resize(read,readLength);
226
227 TGenome readTemplate;// = infix(currentSource,startPos,maxEnd);
228 resize(readTemplate,maxEnd-startPos);
229 arrayCopy(iter(currentSource,startPos), iter(currentSource,maxEnd), begin(readTemplate)); //infix(currentSource,startPos,maxEnd);
230
231 if(revComp) reverseComplement(readTemplate);
232
233 int lastOp = 0;
234 int currOp = 0;
235 // Sequence the reads
236 int countErrors = 0;
237 int pos = 0;
238 int trueLength = 0;
239 bool successful = false;
240
241 while(pos < maxEnd) {
242 lastOp = currOp;
243 double prob = (double)rand()/TEMP_RAND_MAX;
244 // std::cout << "prob = " << prob << "\t";
245 int m;
246 for(m = 0; m < 4; ++m)
247 {
248 double modProb = _transformBack(errorDist[m*readLength + trueLength]);
249 if (prob < modProb)
250 {
251 currOp = m;
252 break;
253 }
254 prob -= modProb;
255 }
256 if (m == 4) std::cout << "HUH?";
257 // std::cout << "operation = " << operation << "\t";
258 if(pos==0 && currOp == SEQAN_INSERT)// (currOp==SEQAN_DELETE || currOp == SEQAN_INSERT))
259 {
260 currOp = 0;
261 continue;
262 }
263 appendValue(modificationPattern,currOp,Generous());
264
265 // ignore reads with Ns
266 if (currOp != SEQAN_INSERT && readTemplate[pos] == 'N')
267 countErrors = maxErrors + 1;
268
269 // Insert Delete is the same as Delete Insert and both are the same as Mismatch (or match)
270 if(currOp == SEQAN_MATCH) read[trueLength] = readTemplate[pos];
271 else
272 {
273 ++countErrors;
274 if(currOp != SEQAN_INSERT)
275 read[trueLength] = sample(currOp,readTemplate[pos]);
276 }
277 if(currOp != SEQAN_INSERT) ++trueLength; //if read nucleotide is not deleted
278 if(currOp != SEQAN_DELETE) ++pos; //if read nucleotide is not an insert
279 // if((lastOp==SEQAN_DELETE && currOp==SEQAN_INSERT) || (currOp==SEQAN_DELETE && lastOp==SEQAN_INSERT))
280 // {
281 // --countErrors;
282 // currOp = SEQAN_MISMATCH;
283 // // std::cout << "ID=DI=M\n";
284 // }
285 // std::cout << "true len = " << trueLength << std::endl;
286 if(trueLength == readLength || countErrors >= maxErrors)
287 {
288 if(countErrors < maxErrors)// && currOp != SEQAN_INSERT)
289 successful = true;
290 break;
291 }
292 }
293
294 /*
295 if(successful)
296 {
297 int patLen = length(modificationPattern);
298 int err = 0, del = 0;
299 for (int j = 0; j < KJ; ++j)
300 {
301 switch (modificationPattern[j]) {
302 case SEQAN_MATCH:
303 ++del;
304 break;
305
306 case SEQAN_DELETE:
307 ++del;
308 SEQAN_FALLTHROUGH
309
310 case SEQAN_INSERT:
311 ++err;
312 break;
313
314 default:;
315 }
316 if (del > 0 && del <= err)
317 {
318 successful = false;
319 ++inValidModPat;
320 break;
321 }
322 }
323 if(successful)
324 {
325 err = del = 0;
326 for (int j = patLen - 1; j >= patLen - KJ; --j)
327 {
328 switch (modificationPattern[j]) {
329 case SEQAN_MATCH:
330 ++del;
331 break;
332
333 case SEQAN_DELETE:
334 ++del;
335 SEQAN_FALLTHROUGH
336
337 case SEQAN_INSERT:
338 ++err;
339 break;
340
341 default:;
342 }
343 if (del > 0 && del <= err)
344 {
345 successful = false;
346 ++inValidModPat;
347 break;
348 }
349 }
350 }
351 }
352 */
353
354 /* countMateErrors = 0
355 if (simulateMatePairs == 1) {
356 for(pos in 1:length(readMate)) {
357 if (runif(1) <= _transformBack(errorDist[pos])) {
358 readMate[pos] = sample(alphabet[alphabet!= readMate[pos]], 1)
359 countMateErrors = countMateErrors + 1
360 }
361 }
362 }*/
363
364 if(successful)
365 {
366 //verify that number of errors is correct
367 bool kickOut = false;
368 /* int start1 = startPos;
369 int maxEnd1 = maxEnd;
370 while(start1 > 0 && (startPos - start1) < countErrors) --start1;
371 while(maxEnd1 > 0 && (maxEnd1 - maxEnd) < countErrors) ++maxEnd1;
372 TGenomeInfix genomeInfix(currentSource,start1,maxEnd1);
373 TMyersFinder myersFinder(genomeInfix);
374
375 // init forward verifiers
376 if(revComp) reverseComplement(read);
377 TMyersPattern forwardPattern(read);
378 TMyersPattern &myersPattern = forwardPattern;
379
380 // find end of best semi-global alignment
381 int maxScore = std::numeric_limits<int>::min();
382 int minScore = -(int)countErrors;
383 TMyersFinder maxPos;
384 while (find(myersFinder, myersPattern, minScore))
385 if (maxScore < getScore(myersPattern))
386 {
387 maxScore = getScore(myersPattern);
388 maxPos = myersFinder;
389 }
390
391 if (maxScore >= minScore)
392 {
393 TGenomeInfixRev infRev(infix(currentSource, start1, start1+position(maxPos)+1));
394 TReadRev readRev(read);
395 TMyersFinderRev myersFinderRev(infRev);
396 TMyersPatternRev myersPatternRev(readRev);
397 // find beginning of best semi-global alignment
398 if (find(myersFinderRev, myersPatternRev, maxScore))
399 start1 = start1 + position(maxPos) - (position(myersFinderRev) + 1);
400 else {
401 // this case should never occur
402 if(revComp) std::cerr <<"reverse\n";
403 std::cerr << "posMaxpos = " << position(maxPos) << std::endl;
404 std::cerr << "startPos = " << startPos << std::endl;
405 std::cerr << "maxEnd = " << maxEnd << std::endl;
406 std::cerr << "HUH?\n" << std::endl;
407 std::cerr << "fGENOME: " << host(myersFinder) << std::endl;
408 std::cerr << "fREAD: " << read << std::endl;
409 std::cerr << "iGENOME: " << infix(currentSource, start1,start1+position(maxPos)+1) << std::endl;
410 std::cerr << "rGENOME: " << infRev << std::endl;
411 std::cerr << "rREAD: " << readRev << std::endl;
412 }
413
414 }
415 if(revComp) reverseComplement(read);
416 SEQAN_ASSERT(maxScore >= -(int)countErrors);
417 if(maxScore != -(int)countErrors)
418 kickOut = true;*/
419 if(!kickOut)
420 {
421 std::stringstream id;
422 resize(read,trueLength);
423 ++bucketCounter[countErrors];
424 ++readCounter;
425 if(verbose && readCounter%10000 == 0)std::cout << readCounter<<"..." << std::flush;
426 //Add read to readSet
427 if(!revComp) id << startPos << ',' << startPos+pos;
428 else id << maxEnd << ',' << maxEnd-pos;
429 id << "[id=" << readCounter << ",fragId=" << readCounter % realNumReads;
430 id << ",repeatId=" << 0 <<",errors=" << countErrors;
431 if (revComp) id << ",orientation=R]";
432 else id << ",orientation=F]";
433
434 appendValue(readIDs, id.str(),Generous());
435 appendValue(readSet, read, Generous());
436 }
437 else ++kickOutCount[countErrors];
438
439 }
440 ++samplePosCounter;
441 // else std::cout << "Not successful\n";
442 }
443 // if (simulateMatePairs == 1) {
444 // print(bucketCounter / (2* numOfReads))
445 // } else {
446 if (verbose)
447 {
448 std::cout << "\n\nBucket frequencies:\n";
449 for(unsigned i = 0; i < length(bucketCounter); ++i)
450 std::cout << (double) bucketCounter[i] / numReads << std::endl;
451 std::cout << std::endl;
452 std::cout << "\nBucket kickout count:\n";
453 for(unsigned i = 0; i < length(kickOutCount); ++i)
454 {
455 if((kickOutCount[i] + bucketCounter[i]) > 0) std::cout << (double) kickOutCount[i] / (kickOutCount[i] + bucketCounter[i]) << std::endl;
456 else std::cout << "0\n";
457 }
458 std::cout << std::endl;
459
460 std::cout << "\nInvalid modification pattern count: "<<inValidModPat<<std::endl;
461 }
462
463 // if (simulateMatePairs == 1) {
464 // write(scan(tmpPath, what = 'character'), file=readPath, sep=std::endl, append = TRUE)
465 // unlink(tmpPath)
466 // }
467 }
468
469 }
470
471 #endif
472