1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by Anne-Katrin Emde
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_READ_SIMULATOR_H
23 #define SEQAN_HEADER_READ_SIMULATOR_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <algorithm>
29 #include <random>
30
31 #include <seqan/sequence.h>
32 #include <seqan/find.h>
33 #include <seqan/modifier.h>
34
35
36 namespace seqan
37 {
38
39 template <typename TOperation, typename TAlphabet>
40 inline TAlphabet
sample(std::mt19937 & rng,TOperation m,TAlphabet base)41 sample(std::mt19937 & rng, TOperation m, TAlphabet base)
42 {
43 std::uniform_int_distribution<int> dist(0, 2);
44
45 int ret = Dna(dist(rng));
46 if (m == SEQAN_MISMATCH && ret >= ordValue(base))
47 ret += 1;
48 return Dna(ret);
49 }
50
51 template < typename TGenome >
simulateGenome(std::mt19937 & rng,TGenome & genome,int size)52 void simulateGenome(std::mt19937 & rng,
53 TGenome &genome, int size)
54 {
55 // mtRandInit();
56 resize(genome, size);
57 for(int i = 0; i < size; ++i)
58 genome[i] = sample(rng, 0, (Dna)0);
59 }
60
61
62 template<typename TPosString>
63 void
fillupStartpos(std::mt19937 & rng,TPosString & sortedStartPos,int numReads,int readLength,int maxErrors,int libSize,int libError,int seqLength,double forwardProb)64 fillupStartpos(
65 std::mt19937 & rng,
66 TPosString & sortedStartPos,
67 int numReads,
68 int readLength,
69 int maxErrors, // how many errors they may have
70 int libSize, // library size, 0 disables mate-pair simulation
71 int libError,
72 int seqLength, // maximal library size deviation
73 double forwardProb)
74 {
75
76 const int REVCOMP = 1 << (sizeof(int)*8-1);
77 resize(sortedStartPos, numReads);
78
79 int fragmentSize = readLength;
80 if (libSize > 0)
81 fragmentSize = libSize + libError;
82
83
84 // sample positions
85 std::uniform_real_distribution<double> dist(0, 1);
86 for (int i = 0; i < numReads; ++i)
87 sortedStartPos[i] = (int)((seqLength - fragmentSize - maxErrors + 1.0) * dist(rng));
88
89 std::sort(begin(sortedStartPos),end(sortedStartPos));
90
91 // sample orientations
92 for (int i = 0; i < numReads; ++i)
93 if (dist(rng) >= forwardProb)
94 sortedStartPos[i] |= REVCOMP;
95
96 if (libSize > 0)
97 {
98 resize(sortedStartPos,2*numReads);
99 // sample mate-pair positions and inverse orientations
100 for(int i=0;i<numReads;++i)
101 {
102 int leftPos = sortedStartPos[i] & ~REVCOMP;
103 int lSize = fragmentSize - (int)((2.0 * libError + 1.0) * dist(rng));
104 int rightPos = leftPos + lSize - readLength;
105 if ((sortedStartPos[i] & REVCOMP) == 0)
106 {
107 sortedStartPos[i+numReads] = rightPos | REVCOMP;
108 }
109 else
110 {
111 sortedStartPos[i] = rightPos | REVCOMP;
112 sortedStartPos[i+numReads] = leftPos;
113 }
114 }
115 numReads*=2;
116 }
117 }
118
119
120
121 //////////
122 // Simulates a set of reads from a set of haplotypes with a certain error distribution
123 //////////
124 template <
125 typename TReadSet,
126 typename TReadIDs,
127 typename TGenomeSet,
128 typename TDistr >
129 void simulateReads(
130 std::mt19937 & rng,
131 TReadSet &readSet, // generated read sequences
132 TReadIDs &readIDs, // corresponding Fasta ids
133 TGenomeSet &genomeSet, // source genome sequences
134 int numReads, // how many reads should be generated
135 int maxErrors, // how many errors they may have
136 TDistr &errorDist, // error probability distribution
137 int libSize, // library size, 0 disables mate-pair simulation
138 int libError, // maximal library size deviation
139 double forwardProb,
140 bool verbose = false)
141 {
142 //typedef typename Value<TReadSet>::Type TRead;
143 //typedef typename Value<TGenomeSet>::Type TGenome;
144 //typedef typename Infix<TGenome>::Type TGenomeInfix;
145 //typedef ModifiedString<TGenomeInfix, ModReverse> TGenomeInfixRev;
146 //typedef ModifiedString<TRead, ModReverse> TReadRev;
147
148 //typedef Finder<TGenomeInfix> TMyersFinder;
149 //typedef Pattern<TRead, MyersUkkonen> TMyersPattern;
150
151 // typedef Finder<TGenomeInfix> TMyersFinderRev;
152 //typedef Finder<TGenomeInfixRev> TMyersFinderRev;
153 //typedef Pattern<TReadRev, MyersUkkonenGlobal> TMyersPatternRev;
154
155 // mtRandInit();
156 // typedef TGenome TRevComp;
157
158 int readLength = length(errorDist)/4;
159 const int REVCOMP = 1 << (sizeof(int)*8-1);
160 //int KJ = 2*maxErrors;
161
162 String<int> bucketCounter;
163 resize(bucketCounter,maxErrors,0);
164
165 String<int> kickOutCount;
166 resize(kickOutCount,maxErrors,0);
167
168 if (verbose)
169 std::cout << "\nSimulating...";
170
171 String<int> modificationPattern;
172 reserve(modificationPattern, readLength + maxErrors);
173 int inValidModPat = 0;
174
175 //at the moment: nur eine source sequenz --> nimm immer genomeSet[0]
176 TGenome& currentSource = genomeSet[0];
177 int seqLength = length(currentSource);
178
179 // int fragmentSize = readLength;
180 // if (libSize > 0)
181 // fragmentSize = libSize + libError;
182
183 String<int> sortedStartPos;
184 /* # Pick a library size
185 currentLibrary = sample(1:(length(librarySizes)), 1)
186 lSize = round(rnorm(1, mean=librarySizes[currentLibrary], sd=librarySd[currentLibrary]))
187 if (start < end) {
188 if (start + libSize <= seqLength) {
189 startMatePair = start + libSize - readLength + 1
190 endMatePair = startMatePair + readLength - 1
191 readMate = sourceSeq[startMatePair:endMatePair]
192 readMate = reverseComplement(readMate)
193 tmp = startMatePair
194 startMatePair = endMatePair
195 endMatePair = tmp
196 invalidMatePair = 0
197 }
198 } else {
199 if (start - libSize >= 1) {
200 startMatePair = start - libSize
201 endMatePair = startMatePair + readLength - 1
202 readMate = sourceSeq[startMatePair:endMatePair]
203 invalidMatePair = 0
204 }
205 }*/
206
207 int realNumReads = numReads;
208 unsigned int samplePosCounter = 0;
209 int readCounter = 0;
210 while (readCounter < numReads) {
211 clear(modificationPattern);
212 //# Pick a haplotype
213 //currentHaplotype = sample(1:(length(haplotypes)), 1)
214
215 // Sample a read
216 if(samplePosCounter >= length(sortedStartPos))//get new start positions
217 {
218 fillupStartpos(rng, sortedStartPos, numReads, readLength, maxErrors, libSize, libError, seqLength, forwardProb);
219 samplePosCounter = 0;
220 }
221 int startPos = sortedStartPos[samplePosCounter] & ~REVCOMP;
222 bool revComp = (sortedStartPos[samplePosCounter] & REVCOMP) != 0;
223 int maxEnd = startPos + readLength + maxErrors - 1;
224
225 TGenome read;
226 resize(read,readLength);
227
228 TGenome readTemplate;// = infix(currentSource,startPos,maxEnd);
229 resize(readTemplate,maxEnd-startPos);
230 arrayCopy(iter(currentSource,startPos), iter(currentSource,maxEnd), begin(readTemplate)); //infix(currentSource,startPos,maxEnd);
231
232 if(revComp) reverseComplement(readTemplate);
233
234 // int lastOp = 0;
235 int currOp = 0;
236 // Sequence the reads
237 int countErrors = 0;
238 int pos = 0;
239 int trueLength = 0;
240 bool successful = false;
241
242 std::uniform_real_distribution<double> dist(0, 1);
243
244 while(pos < maxEnd) {
245 // lastOp = currOp;
246 double prob = dist(rng);
247 // std::cout << "prob = " << prob << "\t";
248 int m;
249 for(m = 0; m < 4; ++m)
250 {
251 double modProb = _transformBack(errorDist[m*readLength + trueLength]);
252 if (prob < modProb)
253 {
254 currOp = m;
255 break;
256 }
257 prob -= modProb;
258 }
259 if (m == 4) std::cout << "HUH?";
260 // std::cout << "operation = " << operation << "\t";
261 if(pos==0 && currOp == SEQAN_INSERT)// (currOp==SEQAN_DELETE || currOp == SEQAN_INSERT))
262 {
263 currOp = 0;
264 continue;
265 }
266 appendValue(modificationPattern,currOp,Generous());
267
268 // ignore reads with Ns
269 if (currOp != SEQAN_INSERT && readTemplate[pos] == 'N')
270 countErrors = maxErrors + 1;
271
272 // Insert Delete is the same as Delete Insert and both are the same as Mismatch (or match)
273 if(currOp == SEQAN_MATCH) read[trueLength] = readTemplate[pos];
274 else
275 {
276 ++countErrors;
277 if(currOp != SEQAN_INSERT)
278 read[trueLength] = sample(rng, currOp,readTemplate[pos]);
279 }
280 if(currOp != SEQAN_INSERT) ++trueLength; //if read nucleotide is not deleted
281 if(currOp != SEQAN_DELETE) ++pos; //if read nucleotide is not an insert
282 // if((lastOp==SEQAN_DELETE && currOp==SEQAN_INSERT) || (currOp==SEQAN_DELETE && lastOp==SEQAN_INSERT))
283 // {
284 // --countErrors;
285 // currOp = SEQAN_MISMATCH;
286 // // std::cout << "ID=DI=M\n";
287 // }
288 // std::cout << "true len = " << trueLength << std::endl;
289 if(trueLength == readLength || countErrors >= maxErrors)
290 {
291 if(countErrors < maxErrors)// && currOp != SEQAN_INSERT)
292 successful = true;
293 break;
294 }
295 }
296
297 /*
298 if(successful)
299 {
300 int patLen = length(modificationPattern);
301 int err = 0, del = 0;
302 for (int j = 0; j < KJ; ++j)
303 {
304 switch (modificationPattern[j]) {
305 case SEQAN_MATCH:
306 ++del;
307 break;
308
309 case SEQAN_DELETE:
310 ++del;
311 SEQAN_FALLTHROUGH
312
313 case SEQAN_INSERT:
314 ++err;
315 break;
316
317 default:;
318 }
319 if (del > 0 && del <= err)
320 {
321 successful = false;
322 ++inValidModPat;
323 break;
324 }
325 }
326 if(successful)
327 {
328 err = del = 0;
329 for (int j = patLen - 1; j >= patLen - KJ; --j)
330 {
331 switch (modificationPattern[j]) {
332 case SEQAN_MATCH:
333 ++del;
334 break;
335
336 case SEQAN_DELETE:
337 ++del;
338 SEQAN_FALLTHROUGH
339
340 case SEQAN_INSERT:
341 ++err;
342 break;
343
344 default:;
345 }
346 if (del > 0 && del <= err)
347 {
348 successful = false;
349 ++inValidModPat;
350 break;
351 }
352 }
353 }
354 }
355 */
356
357 /* countMateErrors = 0
358 if (simulateMatePairs == 1) {
359 for(pos in 1:length(readMate)) {
360 if (runif(1) <= _transformBack(errorDist[pos])) {
361 readMate[pos] = sample(alphabet[alphabet!= readMate[pos]], 1)
362 countMateErrors = countMateErrors + 1
363 }
364 }
365 }*/
366
367 if(successful)
368 {
369 //verify that number of errors is correct
370 bool kickOut = false;
371 /* int start1 = startPos;
372 int maxEnd1 = maxEnd;
373 while(start1 > 0 && (startPos - start1) < countErrors) --start1;
374 while(maxEnd1 > 0 && (maxEnd1 - maxEnd) < countErrors) ++maxEnd1;
375 TGenomeInfix genomeInfix(currentSource,start1,maxEnd1);
376 TMyersFinder myersFinder(genomeInfix);
377
378 // init forward verifiers
379 if(revComp) reverseComplement(read);
380 TMyersPattern forwardPattern(read);
381 TMyersPattern &myersPattern = forwardPattern;
382
383 // find end of best semi-global alignment
384 int maxScore = std::numeric_limits<int>::min();
385 int minScore = -(int)countErrors;
386 TMyersFinder maxPos;
387 while (find(myersFinder, myersPattern, minScore))
388 if (maxScore < getScore(myersPattern))
389 {
390 maxScore = getScore(myersPattern);
391 maxPos = myersFinder;
392 }
393
394 if (maxScore >= minScore)
395 {
396 TGenomeInfixRev infRev(infix(currentSource, start1, start1+position(maxPos)+1));
397 TReadRev readRev(read);
398 TMyersFinderRev myersFinderRev(infRev);
399 TMyersPatternRev myersPatternRev(readRev);
400 // find beginning of best semi-global alignment
401 if (find(myersFinderRev, myersPatternRev, maxScore))
402 start1 = start1 + position(maxPos) - (position(myersFinderRev) + 1);
403 else {
404 // this case should never occur
405 if(revComp) std::cerr <<"reverse\n";
406 std::cerr << "posMaxpos = " << position(maxPos) << std::endl;
407 std::cerr << "startPos = " << startPos << std::endl;
408 std::cerr << "maxEnd = " << maxEnd << std::endl;
409 std::cerr << "HUH?\n" << std::endl;
410 std::cerr << "fGENOME: " << host(myersFinder) << std::endl;
411 std::cerr << "fREAD: " << read << std::endl;
412 std::cerr << "iGENOME: " << infix(currentSource, start1,start1+position(maxPos)+1) << std::endl;
413 std::cerr << "rGENOME: " << infRev << std::endl;
414 std::cerr << "rREAD: " << readRev << std::endl;
415 }
416
417 }
418 if(revComp) reverseComplement(read);
419 SEQAN_ASSERT(maxScore >= -(int)countErrors);
420 if(maxScore != -(int)countErrors)
421 kickOut = true;*/
422 if(!kickOut)
423 {
424 std::stringstream id;
425 resize(read,trueLength);
426 ++bucketCounter[countErrors];
427 ++readCounter;
428 if(verbose && readCounter%10000 == 0)std::cout << readCounter<<"..." << std::flush;
429 //Add read to readSet
430 if(!revComp) id << startPos << ',' << startPos+pos;
431 else id << maxEnd << ',' << maxEnd-pos;
432 id << "[id=" << readCounter << ",fragId=" << readCounter % realNumReads;
433 id << ",repeatId=" << 0 <<",errors=" << countErrors;
434 if (revComp) id << ",orientation=R]";
435 else id << ",orientation=F]";
436
437 appendValue(readIDs, id.str(),Generous());
438 appendValue(readSet, read, Generous());
439 }
440 else ++kickOutCount[countErrors];
441
442 }
443 ++samplePosCounter;
444 // else std::cout << "Not successful\n";
445 }
446 // if (simulateMatePairs == 1) {
447 // print(bucketCounter / (2* numOfReads))
448 // } else {
449 if (verbose)
450 {
451 std::cout << "\n\nBucket frequencies:\n";
452 for(unsigned i = 0; i < length(bucketCounter); ++i)
453 std::cout << (double) bucketCounter[i] / numReads << std::endl;
454 std::cout << std::endl;
455 std::cout << "\nBucket kickout count:\n";
456 for(unsigned i = 0; i < length(kickOutCount); ++i)
457 {
458 if((kickOutCount[i] + bucketCounter[i]) > 0) std::cout << (double) kickOutCount[i] / (kickOutCount[i] + bucketCounter[i]) << std::endl;
459 else std::cout << "0\n";
460 }
461 std::cout << std::endl;
462
463 std::cout << "\nInvalid modification pattern count: "<<inValidModPat<<std::endl;
464 }
465
466 // if (simulateMatePairs == 1) {
467 // write(scan(tmpPath, what = 'character'), file=readPath, sep=std::endl, append = TRUE)
468 // unlink(tmpPath)
469 // }
470 }
471
472 }
473
474 #endif
475