1 /*==========================================================================
2 SeqAn - The Library for Sequence Analysis
3 http://www.seqan.de
4 ============================================================================
5 Copyright (C) 2007
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 ============================================================================
18 $Id$
19 ==========================================================================*/
20
21 #ifndef SEQAN_HEADER_OVERLAP_MODULE_H
22 #define SEQAN_HEADER_OVERLAP_MODULE_H
23 //#define DEBUG_OVERLAP_MODULE
24
25 namespace SEQAN_NAMESPACE_MAIN
26 {
27
28 //////////////////////////////////////////////////////////////////////////////
29 // getIdsFroRead
30 //////////////////////////////////////////////////////////////////////////////
31
32 template<typename TAnnoIds, typename TSpec, typename TConfig, typename TIntervalTree, typename TIntervals>
33 inline void
getIdsForRead(TAnnoIds & ids,FragmentStore<TSpec,TConfig> & me,TIntervalTree & intervalTree,TIntervals & alignIntervals,unsigned offsetInterval)34 getIdsForRead(TAnnoIds & ids, FragmentStore<TSpec, TConfig> & me, TIntervalTree & intervalTree, TIntervals & alignIntervals, unsigned offsetInterval)
35 {
36 typedef typename FragmentStore<TSpec, TConfig>::TContigPos TContigPos;
37 typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore TAnnotationStore;
38 typedef typename Value<TAnnotationStore>::Type TAnnotationStoreElement;
39 typedef typename TAnnotationStoreElement::TId TId;
40 typedef typename Value<TIntervals>::Type TInterval;
41 typedef String<TId> TResult;
42 typedef typename Iterator<TIntervals >::Type TIntervalIter;
43 typedef typename Iterator<StringSet<TResult > >::Type TResultIter;
44 typedef typename Iterator<TResult >::Type TIdIter;
45
46 static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
47
48 resize(ids, length(alignIntervals));
49
50 TIntervalIter itI = begin(alignIntervals);
51 TIntervalIter itIEnd = end(alignIntervals);
52 TResultIter itR = begin(ids);
53 TResultIter itREnd = end(ids);
54
55 // search mapped annotations for each interval of the aligned read and store them in the corresponding list 'ids'
56 for ( ; itI != itIEnd; goNext(itI), goNext(itR))
57 {
58 findIntervalsForInterval(value(itR), intervalTree, getValue(itI), offsetInterval);
59 }
60
61 // check for each aligment-interval, if the inner interval-borders fit to the borders of the annotation id:
62 itI = begin(alignIntervals);
63 itR = begin(ids);
64 TId currentId;
65 TContigPos beginPos;
66 TContigPos endPos;
67 TContigPos help;
68
69 for ( ; itI != itIEnd; goNext(itI), goNext(itR))
70 {
71 for (unsigned i = 0; i < length(getValue(itR)); ++i)
72 {
73 currentId = getValue(getValue(itR), i);
74 beginPos = getValue(me.annotationStore, currentId).beginPos;
75 endPos = getValue(me.annotationStore, currentId).endPos;
76
77 if (beginPos > endPos)
78 {
79 help = beginPos;
80 beginPos = endPos;
81 endPos = beginPos;
82 }
83 // begin of read
84 if (itR == begin(ids) && length(ids) > 1)
85 {
86 if (static_cast<TContigPos>(getValue(itI).i2 + offsetInterval) < endPos) // if the borders don't fit: delete annotation-id
87 {
88 erase(value(itR), i);
89 --i;
90 }
91 }
92 // end of read
93 else if (position(itR, ids) == endPosition(ids) - 1 && length(ids) > 1u)
94 {
95 if (static_cast<TContigPos>(getValue(itI).i1 - offsetInterval) > beginPos)
96 {
97 erase(value(itR), i);
98 --i;
99 }
100 }
101 // in the middle of the read
102 else if (length(ids) > 2)
103 {
104 if (static_cast<TContigPos>(getValue(itI).i2 + offsetInterval) < endPos)
105 {
106 erase(value(itR), i);
107 --i;
108 }
109 else if (static_cast<TContigPos>(getValue(itI).i1 - offsetInterval) > beginPos)
110 {
111 erase(value(itR), i);
112 --i;
113 }
114 }
115 }
116 if (empty(getValue(itR)) ) // if aligment-interval doesn't fit to any annotation, append INVALID_ID to mark this
117 appendValue(value(itR), INVALID_ID, Generous());
118 }
119 }
120
121
122 //////////////////////////////////////////////////////////////////////////////
123 ////// ReadAnnoStoreELement
124 //////////////////////////////////////////////////////////////////////////////
125 template <typename TId>
126 struct ReadAnnoStoreElement
127 {
128 typedef StringSet<String<TId> > TAnnoIds;
129
130 TAnnoIds annoIds;
131 String<TId> parentIds; // not only for exon-annotations -> more than one parentId possible, only if whole read mapped in parent
132 TId contigId;
133 };
134
135
136 //////////////////////////////////////////////////////////////////////////////
137 ////// assign Ids to ReadAnnoStore
138 //////////////////////////////////////////////////////////////////////////////
139 template<typename TReadAnnoStore, typename TSpec, typename TConfig, typename TId, typename TAnnoIds>
140 inline void
assignToReadAnnoStore(TReadAnnoStore & readAnnoStore,FragmentStore<TSpec,TConfig> & me,TId readId,TAnnoIds & annoIds)141 assignToReadAnnoStore(TReadAnnoStore &readAnnoStore, FragmentStore<TSpec, TConfig> & me, TId readId, TAnnoIds &annoIds)
142 {
143 typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore TAnnotationStore;
144 typedef typename Value<TAnnotationStore>::Type TAnnotationStoreElement;
145 typedef typename Iterator<TAnnoIds>::Type TAnnoIdsIter;
146 typedef typename Value<TAnnoIds>::Type TIds;
147 typedef typename Iterator<TIds>::Type TIdsIter;
148
149 static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
150
151 // assign annotationIds:
152 value(readAnnoStore, readId).annoIds = annoIds;
153
154 // assign coresponding parentIds:
155 clear (value(readAnnoStore, readId).parentIds);
156 if(!empty(annoIds))
157 {
158 TIdsIter itId = begin(front(annoIds));
159 TIdsIter itIdEnd = end(front(annoIds));
160 for ( ; itId != itIdEnd; goNext(itId)) // read maps in gene, if all intervals map in gene: at least one exon of the gene has to occur in the id-list of the first interval
161 if (getValue(itId) != INVALID_ID && !isElement_unsorted(getValue(me.annotationStore, getValue(itId)).parentId, getValue(readAnnoStore, readId).parentIds))
162 appendValue(value(readAnnoStore, readId).parentIds, getValue(me.annotationStore, getValue(itId)).parentId, Generous() );
163
164 if (!empty(getValue(readAnnoStore, readId).parentIds))
165 {
166 TAnnoIdsIter itA = begin(annoIds);
167 TAnnoIdsIter itAEnd = end(annoIds);
168 goNext(itA);
169 for ( ; itA != itAEnd; goNext(itA)) // not only for exon-annotations -> more than one parentId possible
170 {
171 itId = begin(getValue(itA)); // for each interval of read:
172 itIdEnd = end(getValue(itA));
173 for (unsigned i = 0; i < length(getValue(readAnnoStore, readId).parentIds); ++i) // check if at least one child of the parentId occurs
174 {
175 for ( ; itId != itIdEnd; goNext(itId))
176 {
177 if (getValue(itId) != INVALID_ID && getValue(me.annotationStore, getValue(itId)).parentId == getValue(getValue(readAnnoStore, readId).parentIds, i) )
178 break;
179 }
180 if (itId == itIdEnd) // if not, delete parentId
181 {
182 erase(value(readAnnoStore, readId).parentIds, i);
183 --i;
184 }
185 }
186 }
187 }
188 }
189 }
190
191
192 //////////////////////////////////////////////////////////////////////////////
193 ////// buildTupleCountStore
194 //////////////////////////////////////////////////////////////////////////////
195 template <typename TId>
196 struct TupleCountStoreElement
197 {
198 typedef String<TId> TTuple;
199 typedef String<TTuple > TTupleList;
200 typedef String<unsigned> TTupleCounts;
201 typedef String<double> TTupleNorm;
202
203 TTupleList readConnections;
204 TTupleCounts readConnectionCounts;
205 TTupleNorm readConnectionNorm;
206 TTupleList matePairConnections;
207 TTupleCounts matePairConnectionCounts;
208 TTupleNorm matePairConnectionNorm;
209 };
210
211
212 //////////////////////////////////////////////////////////////////////////////
213 template<typename TTupleCountStore, typename TSpec, typename TConfig, typename TReadAnnoStore>
214 inline void
buildTupleCountStore(TTupleCountStore & tupleCountStore,FragmentStore<TSpec,TConfig> & me,TReadAnnoStore & readAnnoStore,unsigned n,bool exact_nTuple)215 buildTupleCountStore(TTupleCountStore & tupleCountStore,
216 FragmentStore<TSpec, TConfig> & me,
217 TReadAnnoStore & readAnnoStore,
218 unsigned n,
219 bool exact_nTuple)
220 {
221 typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore TAnnotationStore;
222 typedef typename FragmentStore<TSpec, TConfig>::TContigPos TPos;
223 typedef typename FragmentStore<TSpec, TConfig>::TReadStore TReadStore;
224 typedef typename Value<TReadStore>::Type TReadStoreElement;
225 typedef typename TReadStoreElement::TId TReadId;
226 typedef typename Value<TAnnotationStore>::Type TAnnotationStoreElement;
227 typedef typename TAnnotationStoreElement::TId TId;
228 typedef typename Iterator<TReadAnnoStore>::Type TReadIter;
229 typedef typename Value<TReadAnnoStore>::Type TReadAnnoStoreElement;
230 typedef typename TReadAnnoStoreElement::TAnnoIds TAnnoIds;
231 typedef typename Iterator<TAnnoIds>::Type TAnnoIdsIter;
232 typedef typename Value<TAnnoIds>::Type TIds;
233 typedef typename Iterator<TIds>::Type TIdsIter;
234
235 static const TReadId INVALID_READ_ID = TReadStoreElement::INVALID_ID;
236 static const TId INVALID_ANNO_ID = TAnnotationStoreElement::INVALID_ID;
237
238 resize(tupleCountStore, length(me.annotationStore));
239
240 bool validMate;
241 TReadIter itRead = begin(readAnnoStore);
242 TReadIter itReadEnd = end(readAnnoStore);
243 TIdsIter itP;
244 TIdsIter itPEnd;
245 TAnnoIds annoIds;
246 TAnnoIds tupleSet;
247 TReadId readId;
248 TReadId matePairId;
249 TReadId secReadId;
250 TAnnoIds secTupleSet;
251 TId firstAnnoId1;
252 TId firstAnnoId2;
253 TAnnoIdsIter itTuple;
254 TAnnoIdsIter itTupleEnd;
255 TAnnoIdsIter itSecTuple;
256 TAnnoIdsIter itSecTupleEnd;
257 TAnnoIdsIter itAnnoIds;
258 TAnnoIdsIter itAnnoIdsEnd;
259 TIds matePairTuple;
260 TPos beginPos1;
261 TPos endPos1;
262 TPos beginPos2;
263 TPos endPos2;
264 unsigned pos;
265
266 for ( ; itRead != itReadEnd; goNext(itRead))
267 {
268 if (!empty(getValue(itRead).parentIds) )
269 {
270 itP = begin(getValue(itRead).parentIds);
271 itPEnd = end(getValue(itRead).parentIds);
272 for ( ; itP != itPEnd; goNext(itP) )
273 {
274 validMate = false;
275 // create list of all possible tuples for current read:
276 annoIds = getValue(itRead).annoIds;
277 clear(tupleSet);
278 // create all Tuple of length n:
279 if (exact_nTuple && n <= length(annoIds)) create_nTuple(tupleSet, me, annoIds, getValue(itP), n);
280 // create all max-Tuple (whole read) for current parentId:
281 else if (!exact_nTuple && n == 0) create_nTuple(tupleSet, me, annoIds, getValue(itP), length(annoIds));
282 // create all tuple >= n for current parentId:
283 else if (!exact_nTuple) create_Tuple(tupleSet, me, annoIds, getValue(itP), n);
284 if (!empty(tupleSet))
285 {
286 // create if necessary list of all possible tuples for second matepair-read:
287 readId = position(itRead, readAnnoStore);
288 matePairId = getValue(me.readStore, readId).matePairId;
289 clear(secTupleSet);
290 if (matePairId != INVALID_READ_ID)
291 {
292 if(getValue(getValue(me.matePairStore, matePairId).readId, 0) == readId)
293 secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 1);
294 else
295 secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 0);
296
297 if ( secReadId != INVALID_READ_ID )
298 {
299 //if (!empty(getValue(readAnnoStore, secReadId).annoIds))
300 if ( isElement_unsorted(getValue(itP), getValue(readAnnoStore, secReadId).parentIds) ) // p in parents of matepair? -> annoIds is not empty
301 {
302 validMate = true;
303 annoIds = getValue(readAnnoStore, secReadId).annoIds;
304 firstAnnoId1 = front(front(tupleSet)); // ids necessary to check positions in aligment
305 firstAnnoId2 = front(front(annoIds)); // can't be INVALID_ID, because parents was checked
306
307 // check if current read-position is smaller than the position of the second read -> tuple are ordered by position
308 if ( (getValue(me.annotationStore, firstAnnoId1).beginPos <=
309 getValue(me.annotationStore,firstAnnoId1).endPos &&
310 getValue(me.annotationStore, firstAnnoId1).beginPos <
311 getValue(me.annotationStore, firstAnnoId2).endPos) ||
312 (getValue(me.annotationStore, firstAnnoId1).beginPos >
313 getValue(me.annotationStore, firstAnnoId1).endPos &&
314 getValue(me.annotationStore, firstAnnoId1).endPos <
315 getValue(me.annotationStore, firstAnnoId2).beginPos) )
316 {
317 if (exact_nTuple && n <= length(annoIds)) create_nTuple(secTupleSet, me, annoIds, getValue(itP), n);
318 else if (!exact_nTuple && n == 0) create_nTuple(secTupleSet, me, annoIds, getValue(itP), length(annoIds));
319 else if (!exact_nTuple) create_Tuple(secTupleSet, me, annoIds, getValue(itP), n);
320 }
321 }
322 }
323 }
324 else validMate = true;
325
326 // access to tupleCountStore for all tuple of current read:
327 if (validMate)
328 {
329 itTuple = begin(tupleSet);
330 itTupleEnd = end(tupleSet);
331 for ( ; itTuple != itTupleEnd; goNext(itTuple))
332 {
333 firstAnnoId1 = front(getValue(itTuple));
334 erase(value(itTuple), 0); // first id is not stored; is know by position in tupleCountStore
335
336 // readConnections:
337 if (!empty(getValue(itTuple)))
338 {
339 if (searchValue(pos, getValue(itTuple), getValue(tupleCountStore, firstAnnoId1).readConnections))
340 ++value(value(tupleCountStore, firstAnnoId1).readConnectionCounts, pos);
341 else
342 {
343 if (pos != endPosition(getValue(tupleCountStore, firstAnnoId1).readConnections) )
344 {
345 resizeSpace(value(tupleCountStore, firstAnnoId1).readConnections, 1, pos, pos, Generous());
346 assignValue(value(tupleCountStore, firstAnnoId1).readConnections, pos, getValue(itTuple));
347 insertValue(value(tupleCountStore, firstAnnoId1).readConnectionCounts, pos, 1, Generous());
348 }
349 else
350 {
351 appendValue(value(tupleCountStore, firstAnnoId1).readConnections, getValue(itTuple), Generous());
352 appendValue(value(tupleCountStore, firstAnnoId1).readConnectionCounts, 1, Generous());
353 }
354 }
355 }
356 // matePairConnections:
357 if (!empty(secTupleSet))
358 {
359 itSecTuple = begin(secTupleSet);
360 itSecTupleEnd = end(secTupleSet);
361 for ( ; itSecTuple != itSecTupleEnd; goNext(itSecTuple) )
362 {
363 matePairTuple = getValue(itTuple);
364 // INVALID_ID: sign for connection by matepair (apart from that, there are no INVALID_IDs in the list)
365 appendValue(matePairTuple, INVALID_ANNO_ID, Generous());
366 if (!empty(getValue(itTuple)) && back(getValue(itTuple)) == front(getValue(itSecTuple)) ) // no id 2x allowed
367 {
368 if (exact_nTuple == 0 && n == 0) erase(value(itSecTuple), 0);
369 else continue; // tupel would be created double or tupel wouldn't have the length n anymore
370 }
371 append(matePairTuple, getValue(itSecTuple), Generous());
372
373 if (empty(getValue(itTuple)))
374 {
375 beginPos1 = getValue(me.annotationStore, firstAnnoId1).beginPos;
376 endPos1 = getValue(me.annotationStore, firstAnnoId1).endPos;
377 }
378 else
379 {
380 beginPos1 = getValue(me.annotationStore, back(getValue(itTuple))).beginPos;
381 endPos1 = getValue(me.annotationStore, back(getValue(itTuple))).endPos;
382 }
383 // begin position of first annotation in tuple of second read
384 beginPos2 = getValue(me.annotationStore, front(getValue(itSecTuple))).beginPos;
385 endPos2 = getValue(me.annotationStore, front(getValue(itSecTuple))).endPos;
386 if ( (beginPos1 <= endPos1 && endPos1 < beginPos2) || // no overlapping annotations allowed
387 (endPos1 < beginPos1 && beginPos1 < endPos2) )
388 {
389 if (searchValue(pos, matePairTuple, getValue(tupleCountStore, firstAnnoId1).matePairConnections))
390 ++value(value(tupleCountStore, firstAnnoId1).matePairConnectionCounts, pos);
391 else
392 {
393 if (pos != endPosition(getValue(tupleCountStore, firstAnnoId1).matePairConnections) )
394 {
395 resizeSpace(value(tupleCountStore, firstAnnoId1).matePairConnections, 1, pos, pos, Generous());
396 assignValue(value(tupleCountStore, firstAnnoId1).matePairConnections, pos, matePairTuple);
397 insertValue(value(tupleCountStore, firstAnnoId1).matePairConnectionCounts, pos, 1, Generous());
398 }
399 else
400 {
401 appendValue(value(tupleCountStore, firstAnnoId1).matePairConnections, matePairTuple, Generous());
402 appendValue(value(tupleCountStore, firstAnnoId1).matePairConnectionCounts, 1, Generous());
403 }
404 }
405 }
406 }
407 }
408 }
409 }
410 }
411 }
412 }
413 }
414 }
415
416
417 //////////////////////////////////////////////////////////////////////////////
418 ////// buildAnnoCountStoreg++ -I../seqan/projects/library/ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -O3 -pedantic -lrt main.cpp -o main
419 //////////////////////////////////////////////////////////////////////////////
420 template<typename TAnnoCountStore, typename TSpec, typename TConfig, typename TReadAnnoStore>
421 inline void
buildAnnoCountStore(TAnnoCountStore & annoCountStore,FragmentStore<TSpec,TConfig> & me,TReadAnnoStore & readAnnoStore)422 buildAnnoCountStore(TAnnoCountStore & annoCountStore, FragmentStore<TSpec, TConfig> & me, TReadAnnoStore & readAnnoStore)
423 {
424 typedef typename Iterator<TReadAnnoStore>::Type TReadIter;
425 typedef typename FragmentStore<TSpec, TConfig>::TReadStore TReadStore;
426 typedef typename Value<TReadStore>::Type TReadStoreElement;
427 typedef typename Value<TReadAnnoStore>::Type TReadAnnoStoreElement;
428 typedef typename TReadAnnoStoreElement::TAnnoIds TAnnoIds;
429 typedef typename Iterator<TAnnoIds>::Type TAnnoIdsIter;
430 typedef typename Value<TAnnoIds>::Type TIds;
431 typedef typename Iterator<TIds>::Type TIdsIter;
432 typedef typename Value<TIds>::Type TId;
433 typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore TAnnotationStore;
434 typedef typename Value<TAnnotationStore>::Type TAnnotationStoreElement;
435
436 static const TId INVALID_READ_ID = TReadStoreElement::INVALID_ID;
437 static const TId INVALID_ANNO_ID = TAnnotationStoreElement::INVALID_ID;
438
439 resize(annoCountStore, length(me.annotationStore), 0);
440
441 TReadIter itRead = begin(readAnnoStore);
442 TReadIter itReadEnd = end(readAnnoStore);
443 TId readId;
444 TId matePairId;
445 TId secReadId;
446 TIds interSecIds;
447 TIdsIter itP;
448 TIdsIter itPEnd;
449 TAnnoIdsIter itAnnoIds;
450 TAnnoIdsIter itAnnoIdsEnd;
451 TIdsIter itId;
452 TIdsIter itIdEnd;
453
454 // increment for each read respective to its mapped ids the count in the annoCountStore
455 for ( ; itRead != itReadEnd; goNext(itRead))
456 {
457 if (!empty(getValue(itRead).annoIds) )
458 {
459 readId = position(itRead, readAnnoStore);
460 matePairId = getValue(me.readStore, readId).matePairId;
461 if (matePairId != INVALID_READ_ID)
462 {
463 if (getValue(getValue(me.matePairStore, matePairId).readId, 0) == readId)
464 secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 1);
465 else
466 secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 0);
467 }
468 // for each parentId: we just want to count annotations, in which the read mapped
469 if (!empty(getValue(itRead).parentIds))
470 {
471 itP = begin(getValue(itRead).parentIds);
472 itPEnd = end(getValue(itRead).parentIds);
473 for (; itP != itPEnd; goNext(itP) )
474 {
475 // check mate-read to prevent double counts
476 if (matePairId != INVALID_READ_ID)
477 {
478 if (!isElement_unsorted(getValue(itP), getValue(readAnnoStore, secReadId).parentIds) )
479 continue; // if matepair read doesn't map in same parentId: no count (go to next parentId)
480
481 // count annotations, which occur in both reads, shouldn't be increment for the read with the bigger readId
482 if (secReadId < readId )
483 {
484 clear(interSecIds);
485 // just check the periphery annotations
486 // if the end of the current read mapped in a same annotation as the start of the second read:
487 if ( interSec(interSecIds, back(getValue(itRead).annoIds), front(getValue(readAnnoStore, secReadId).annoIds)) )
488 {
489 for (unsigned i = 0; i < length(interSecIds); ++i)
490 {
491 if (getValue(me.annotationStore, getValue(interSecIds, i) ).parentId == getValue(itP))
492 --value(annoCountStore, getValue(interSecIds, i));
493 // decrement the corresponding count
494 }
495 }
496 // or if the start of the current read mapped in a same annotation as the end of the second read:
497 else if ( interSec(interSecIds, front(getValue(itRead).annoIds),
498 back(getValue(readAnnoStore, secReadId).annoIds)) )
499 {
500 for (unsigned i = 0; i < length(interSecIds); ++i)
501 {
502 if (getValue(me.annotationStore, getValue(interSecIds, i) ).parentId == getValue(itP))
503 --value(annoCountStore, getValue(interSecIds, i));
504 }
505 }
506 if (getValue(itP) != INVALID_ANNO_ID) --value(annoCountStore, getValue(itP));
507 }
508 }
509
510 // count for all annoIds
511 itAnnoIds = begin(getValue(itRead).annoIds);
512 itAnnoIdsEnd = end(getValue(itRead).annoIds);
513 for ( ; itAnnoIds != itAnnoIdsEnd; goNext(itAnnoIds))
514 {
515 itId = begin(getValue(itAnnoIds));
516 itIdEnd = end(getValue(itAnnoIds));
517 for ( ; itId != itIdEnd; goNext(itId))
518 if (getValue(itId) != INVALID_ANNO_ID && getValue(me.annotationStore, getValue(itId)).parentId == getValue(itP) )
519 ++value(annoCountStore, getValue(itId));
520 }
521
522 // count for parentIds (already selected)
523 if (getValue(itP) != INVALID_ANNO_ID) ++value(annoCountStore, getValue(itP));
524 }
525 }
526 }
527 }
528 }
529
530
531 //////////////////////////////////////////////////////////////////////////////
532 ////// Overlap Module
533 //////////////////////////////////////////////////////////////////////////////
534 template<typename TReadAnnoStore, typename TAnnoCountStore, typename TTupleCountStore, typename TSpec, typename TConfig>
535 inline void
getResults(TReadAnnoStore & readAnnoStore,TAnnoCountStore & annoCountStore,TTupleCountStore & tupleCountStore,FragmentStore<TSpec,TConfig> & me,unsigned tupelSize,bool exact_nTuple,unsigned offsetInterval,unsigned thresholdGaps,bool unknownO)536 getResults(TReadAnnoStore & readAnnoStore,
537 TAnnoCountStore & annoCountStore,
538 TTupleCountStore & tupleCountStore,
539 FragmentStore<TSpec, TConfig> & me,
540 unsigned tupelSize,
541 bool exact_nTuple,
542 unsigned offsetInterval,
543 unsigned thresholdGaps,
544 bool unknownO)
545 {
546 typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore TAnnotationStore;
547 typedef typename Value<TAnnotationStore>::Type TAnnotationStoreElement;
548 typedef typename TAnnotationStoreElement::TId TId;
549 typedef typename FragmentStore<TSpec, TConfig>::TIntervalTreeStore TIntervalTreeStore;
550 typedef typename Iterator<TIntervalTreeStore>::Type TIntervalTree;
551 typedef typename Value<TReadAnnoStore>::Type TReadAnnoStoreElement;
552 typedef typename TReadAnnoStoreElement::TAnnoIds TAnnoIds;
553
554 typedef typename FragmentStore<TSpec, TConfig>::TAlignedReadStore TAlignedReadStore;
555 typedef typename Position<TAlignedReadStore>::Type TAlignPos;
556 typedef String<AlignIntervalsStoreElement<> > TAlignIntervalsStore;
557 typedef typename Iterator<TAlignIntervalsStore>::Type TAlignIntervalsStoreIter;
558
559 resize(readAnnoStore, length(me.readStore));
560
561 TIntervalTree intervalTree;
562
563 // extract intervals from alignedReadStore and store them in AlignIntervalsStore:
564 TAlignIntervalsStore alignIntervalsStore;
565 buildAlignIntervalsStore(alignIntervalsStore, me, thresholdGaps);
566
567 if (!empty(alignIntervalsStore))
568 {
569 TAlignPos alignPos;
570 TId contigId;
571 TId readId;
572 TAnnoIds ids;
573
574 TAlignIntervalsStoreIter it = begin(alignIntervalsStore);
575 TAlignIntervalsStoreIter itEnd = end(alignIntervalsStore);
576 // for each item in alignIntervalsStore:
577 for ( ; it != itEnd; goNext(it))
578 {
579 // get ids from alignedReadStore (same position as in alignIntervalsStore):
580 alignPos = position(it, alignIntervalsStore);
581 contigId = getValue(me.alignedReadStore, alignPos).contigId;
582 readId = getValue(me.alignedReadStore, alignPos).readId;
583 // get respective intervalTree
584 if (unknownO || getValue(me.alignedReadStore, alignPos).beginPos <= getValue(me.alignedReadStore, alignPos).endPos)
585 intervalTree = begin(me.intervalTreeStore_F, Standard()) + contigId; //getValue(me.intervalTreeStore_F, contigId);
586 else
587 intervalTree = begin(me.intervalTreeStore_R, Standard()) + contigId; //getValue(me.intervalTreeStore_R, contigId);
588
589 // get annotationStore-Ids for these intervals:
590 clear(ids);
591 if ((*intervalTree).interval_counter != 0)
592 getIdsForRead(ids, me, *intervalTree, getValue(it).intervals, offsetInterval);
593 // assign Ids from mapped annotations to readAnnoStore:
594 value(readAnnoStore, readId).contigId = contigId;
595 assignToReadAnnoStore(readAnnoStore, me, readId, ids);
596 }
597 }
598 buildAnnoCountStore(annoCountStore, me, readAnnoStore);
599 buildTupleCountStore(tupleCountStore, me, readAnnoStore, tupelSize, exact_nTuple);
600 }
601
602
603 //////////////////////////////////////////////////////////////////////////////
604 /// get normalized values for annotations And get Map for Gene orientations (necessary for annotation Output)
605 //////////////////////////////////////////////////////////////////////////////
606 template<typename TAnnoNormStore, typename TMapO, typename TAnnoCountStore, typename TSpec, typename TConfig>
607 inline void
normalizeAnnoCounts(TAnnoNormStore & annoNormStore,TMapO & mapO,TAnnoCountStore & annoCountStore,FragmentStore<TSpec,TConfig> & me)608 normalizeAnnoCounts(TAnnoNormStore &annoNormStore, TMapO &mapO, TAnnoCountStore &annoCountStore, FragmentStore<TSpec, TConfig> &me)
609 {
610 typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore TAnnotationStore;
611 typedef typename Value<TAnnotationStore>::Type TAnnotationStoreElement;
612 typedef typename TAnnotationStoreElement::TId TId;
613 typedef typename FragmentStore<TSpec, TConfig>::TContigPos TPos;
614 typedef typename FragmentStore<TSpec, TConfig>::TReadStore TReadStore;
615 typedef typename Size<TReadStore>::Type TReadStoreSize;
616 typedef typename Iterator<TAnnotationStore>::Type TAnnoIter;
617 typedef typename Iterator<TAnnoCountStore>::Type TCountIter;
618 typedef typename Iterator<TAnnoNormStore>::Type TNormIter;
619 typedef typename Size<TPos>::Type TSize;
620 typedef String<TSize> TChildrenLengths;
621 typedef typename Iterator<String<TSize> >::Type TLengthIter;
622 typedef Pair<TId, TChildrenLengths> TPair;
623 typedef typename Value<TMapO>::Type TPairO;
624 typedef Map<TPair> TMap;
625 typedef typename Iterator<TMap>::Type TMapIter;
626
627 static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
628 static const TPos INVALID_POS = TAnnotationStoreElement::INVALID_POS;
629
630 resize(annoNormStore, length(annoCountStore), 0);
631
632 TReadStoreSize readNo = length(me.readStore) - length(me.matePairStore);
633
634 TMap map;
635 clear(map);
636 clear(mapO);
637
638 if(!empty(me.annotationStore))
639 {
640 TAnnoIter itA = begin(me.annotationStore);
641 TAnnoIter itAEnd = end(me.annotationStore);
642 TCountIter itC = begin(annoCountStore);
643 TNormIter itN = begin(annoNormStore);
644 TChildrenLengths childrenLengths;
645 TPair pair;
646 TPairO pairO;
647 TSize length;
648
649 for ( ; itA != itAEnd; goNext(itA), goNext(itC), goNext(itN) )
650 {
651 if (getValue(itA).beginPos == INVALID_POS && getValue(itA).parentId == INVALID_ID) // make entry for each gene/parent in map:
652 {
653 clear(childrenLengths);
654 pair.i1 = position(itA, me.annotationStore);
655 pair.i2 = childrenLengths;
656 insert(map, pair); // for lengths
657
658 pairO.i1 = position(itA, me.annotationStore);
659 pairO.i2 = 0;
660 insert(mapO, pairO); // for orientation
661 }
662 else if (getValue(itA).beginPos != INVALID_POS) // for each exon/child:
663 {
664 if (getValue(itA).beginPos <= getValue(itA).endPos)
665 length = getValue(itA).endPos - getValue(itA).beginPos + 1;
666 else
667 length = getValue(itA).beginPos - getValue(itA).endPos + 1;
668
669 value(itN) = ((double)1000000000 * (double)getValue(itC))/((double)readNo * (double)length); // calculate normalized expression-value
670
671 if (getValue(itA).parentId != INVALID_ID) // append length to gene/parent lengths
672 {
673 appendValue(mapValue(map, getValue(itA).parentId), length, Generous());
674 if (getValue(itA).beginPos > getValue(itA).endPos)
675 mapValue(mapO, getValue(itA).parentId) = 1;
676 }
677 }
678 }
679 }
680
681 if (!empty(map))
682 {
683 TMapIter itM = begin(map);
684 TMapIter itMEnd = end(map);
685 TSize length;
686 TLengthIter itL;
687 TLengthIter itLEnd;
688 for ( ; itM != itMEnd; goNext(itM)) // calculate normalized gene/parent expression-values
689 {
690 length = 0;
691 itL = begin(value(itM).i2);
692 itLEnd = end(value(itM).i2);
693 for ( ; itL != itLEnd; goNext(itL))
694 length += getValue(itL);
695 value(annoNormStore, value(itM).i1) = ((double)1000000000 * (double)getValue(annoCountStore, value(itM).i1) )/((double)readNo * (double)length);
696 }
697 }
698 }
699
700
701 //////////////////////////////////////////////////////////////////////////////
702 /// get normalized values for tuple
703 //////////////////////////////////////////////////////////////////////////////
704 template<typename TTupleCountStore, typename TSpec, typename TConfig>
705 inline void
normalizeTupleCounts(TTupleCountStore & tupleCountStore,FragmentStore<TSpec,TConfig> & me)706 normalizeTupleCounts(TTupleCountStore &tupleCountStore, FragmentStore<TSpec, TConfig> &me)
707 {
708 typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore TAnnotationStore;
709 typedef typename Value<TAnnotationStore>::Type TAnnotationStoreElement;
710 typedef typename TAnnotationStoreElement::TId TId;
711 typedef typename Value<TTupleCountStore>::Type TTupleCountStoreElement;
712 typedef typename TTupleCountStoreElement::TTupleList TTupleList;
713 typedef typename TTupleCountStoreElement::TTupleCounts TTupleCounts;
714 typedef typename TTupleCountStoreElement::TTupleNorm TTupleNorm;
715 typedef typename TTupleCountStoreElement::TTuple TTuple;
716 typedef typename Iterator<TTupleCountStore>::Type TStoreIter;
717 typedef typename Iterator<TTupleList>::Type TTupleListIter;
718 typedef typename Iterator<TTupleCounts>::Type TCountIter;
719 typedef typename Iterator<TTupleNorm>::Type TNormIter;
720 typedef typename Iterator<TTuple>::Type TTupleIter;
721 typedef typename FragmentStore<TSpec, TConfig>::TContigPos TPos;
722 typedef typename Size<TPos>::Type TSize;
723 typedef typename FragmentStore<TSpec, TConfig>::TReadStore TReadStore;
724 typedef typename Size<TReadStore>::Type TReadStoreSize;
725
726 static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
727
728 TReadStoreSize readNo = length(me.readStore) - length(me.matePairStore);
729
730 if (!empty(tupleCountStore))
731 {
732 TStoreIter itS = begin(tupleCountStore);
733 TStoreIter itSEnd = end(tupleCountStore);
734 TTupleListIter itT;
735 TTupleListIter itTEnd;
736 TCountIter itC;
737 TNormIter itN;
738 TSize tupleLength;
739 TTupleIter itId;
740 TTupleIter itIdEnd;
741 for ( ; itS != itSEnd; goNext(itS))
742 {
743 // readConnections:
744 resize(value(itS).readConnectionNorm, length(getValue(itS).readConnections));
745 if (!empty(getValue(itS).readConnections))
746 {
747 itT = begin(getValue(itS).readConnections);
748 itTEnd = end(getValue(itS).readConnections);
749 itC = begin(getValue(itS).readConnectionCounts);
750 itN = begin(getValue(itS).readConnectionNorm);
751 for ( ; itT != itTEnd; goNext(itT), goNext(itC), goNext(itN))
752 {
753 tupleLength = 0;
754 itId = begin(getValue(itT));
755 itIdEnd = end(getValue(itT));
756 for ( ; itId != itIdEnd; goNext(itId))
757 {
758 if (getValue(me.annotationStore, getValue(itId)).beginPos <= getValue(me.annotationStore, getValue(itId)).endPos)
759 tupleLength += getValue(me.annotationStore, getValue(itId)).endPos - getValue(me.annotationStore, getValue(itId)).beginPos + 1;
760 else
761 tupleLength += getValue(me.annotationStore, getValue(itId)).beginPos - getValue(me.annotationStore, getValue(itId)).endPos + 1;
762 }
763 value(itN) = ((double)1000000000 * (double)getValue(itC)) / ((double)readNo * (double)tupleLength);
764 }
765 }
766 // matePairConnections:
767 resize(value(itS).matePairConnectionNorm, length(getValue(itS).matePairConnections));
768 if (!empty(getValue(itS).matePairConnections))
769 {
770 itT = begin(getValue(itS).matePairConnections);
771 itTEnd = end(getValue(itS).matePairConnections);
772 itC = begin(getValue(itS).matePairConnectionCounts);
773 itN = begin(getValue(itS).matePairConnectionNorm);
774 for ( ; itT != itTEnd; goNext(itT), goNext(itC), goNext(itN))
775 {
776 tupleLength = 0;
777 itId = begin(getValue(itT));
778 itIdEnd = end(getValue(itT));
779 for ( ; itId != itIdEnd; goNext(itId))
780 {
781 if (getValue(itId) != INVALID_ID)
782 {
783 if (getValue(me.annotationStore, getValue(itId)).beginPos <= getValue(me.annotationStore, getValue(itId)).endPos)
784 tupleLength += getValue(me.annotationStore, getValue(itId)).endPos - getValue(me.annotationStore, getValue(itId)).beginPos + 1;
785 else
786 tupleLength += getValue(me.annotationStore, getValue(itId)).beginPos - getValue(me.annotationStore, getValue(itId)).endPos + 1;
787 }
788 }
789 value(itN) = ((double)1000000000 * (double)getValue(itC)) / ((double)readNo * (double)tupleLength);
790 }
791 }
792 }
793 }
794 }
795
796
797 //////////////////////////////////////////////////////////////////////////////
798 /// NGS Overlapper main function
799 //////////////////////////////////////////////////////////////////////////////
800
801 inline bool
ngsOverlapper(CharString const & nameSAM,CharString const & nameGFF,CharString const & outputPath,unsigned tupleValue,bool exact_nTuple,unsigned thresholdGaps,unsigned offsetInterval,unsigned thresholdCount,double thresholdRPKM,bool unknownO,bool fusion,bool gtf)802 ngsOverlapper(CharString const &nameSAM, CharString const &nameGFF, CharString const &outputPath,
803 unsigned tupleValue, bool exact_nTuple,
804 unsigned thresholdGaps, unsigned offsetInterval,
805 unsigned thresholdCount, double thresholdRPKM,
806 bool unknownO,
807 bool fusion,
808 bool gtf)
809 {
810 FragmentStore<> me;
811 #ifdef DEBUG_OVERLAP_MODULE
812 SEQAN_PROTIMESTART(find1_time);
813 #endif
814 // build contigStore from FASTA file
815 #ifdef DEBUG_OVERLAP_MODULE
816 std::cout << "load Sam..." << std::endl;
817 #endif
818 // read aligned reads in FragmentStore from Sam files
819 std::fstream fileSAM;
820 fileSAM.open(toCString(nameSAM), std::ios_base::in | std::ios_base::binary);
821 if(!fileSAM.is_open()) return false;
822 read(fileSAM, me, Sam());
823 fileSAM.close();
824
825 // readAnnotations from Gff or Gtf:
826 if (gtf == 0)
827 {
828 #ifdef DEBUG_OVERLAP_MODULE
829 SEQAN_PROTIMESTART(find2_time);
830 std::cout << "load Gff..." << std::endl;
831 #endif
832 readAnnotationsFromGFF(me, toCString(nameGFF));
833 }
834 else
835 {
836 #ifdef DEBUG_OVERLAP_MODULE
837 SEQAN_PROTIMESTART(find2_time);
838 std::cout << "load Gff..." << std::endl;
839 #endif
840 readAnnotationsFromGTF(me, toCString(nameGFF));
841 }
842
843 // create IntervalTreeStore:
844 #ifdef DEBUG_OVERLAP_MODULE
845 SEQAN_PROTIMESTART(find3_time);
846 #endif
847 createIntervalTreeStore(me, unknownO);
848 #ifdef DEBUG_OVERLAP_MODULE
849 std::cout << "create intervalTreeStores from annotationStore took: \t" << SEQAN_PROTIMEDIFF(find3_time) << " seconds" << std::endl;
850 #endif
851
852 // build stores for results:
853 String<ReadAnnoStoreElement<unsigned> > readAnnoStore;
854 String<unsigned> annoCountStore;
855 String<TupleCountStoreElement<unsigned> > tupleCountStore;
856 String<TupleCountStoreElement_Fusion<unsigned> > tupleCountStore_Fusion; // additional Store, if fusion genes should be checked
857
858
859
860 // get results with additional check for transfusion genes (will be changed later additionally)
861 if (fusion == 1)
862 getResults_Fusion(readAnnoStore, annoCountStore, tupleCountStore, tupleCountStore_Fusion, me, tupleValue, exact_nTuple, offsetInterval, thresholdGaps, unknownO);
863 else // get normal results:
864 getResults(readAnnoStore, annoCountStore, tupleCountStore, me, tupleValue, exact_nTuple, offsetInterval, thresholdGaps, unknownO);
865
866
867 // normalize:
868 String<double> annoNormStore;
869 Map<Pair<unsigned, bool> > mapO;
870 normalizeAnnoCounts(annoNormStore, mapO, annoCountStore, me);
871 normalizeTupleCounts(tupleCountStore, me);
872 if (fusion == 1)
873 normalizeTupleCounts_Fusion(tupleCountStore_Fusion, me);
874
875
876 // output:
877 CharString pathReadOutput = outputPath;
878 append(pathReadOutput, "readOutput.gff");
879 std::fstream readOutput;
880 readOutput.open(toCString(pathReadOutput), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
881 createReadCountGFF(readOutput, readAnnoStore, me);
882 readOutput.close();
883
884 CharString pathAnnoOutput = outputPath;
885 append(pathAnnoOutput, "annoOutput.gff");
886 std::fstream annoOutput;
887 annoOutput.open(toCString(pathAnnoOutput), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
888 createAnnoCountGFF(annoOutput, annoCountStore, annoNormStore, me, mapO);
889 annoOutput.close();
890
891 CharString pathTupleOutput = outputPath;
892 append(pathTupleOutput, "tupleOutput.gff");
893 std::fstream tupleOutput;
894 tupleOutput.open(toCString(pathTupleOutput), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
895 createTupleCountGFF(tupleOutput, tupleCountStore, me, thresholdCount, thresholdRPKM);
896 tupleOutput.close();
897
898 // additional output, if fusion genes were checked
899 if (fusion == 1)
900 {
901 CharString pathTupleOutput_Fusion = outputPath;
902 append(pathTupleOutput_Fusion, "tupleOutput_Fusion.gff");
903 std::fstream tupleOutput_Fusion;
904 tupleOutput_Fusion.open(toCString(pathTupleOutput_Fusion), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
905 createTupleCountGFF_Fusion(tupleOutput_Fusion, tupleCountStore_Fusion, me, thresholdCount, thresholdRPKM);
906 tupleOutput_Fusion.close();
907 }
908
909
910 #ifdef DEBUG_OVERLAP_MODULE
911 std::cout << "ngsOverlapper-function took: \t" << SEQAN_PROTIMEDIFF(find1_time) << " seconds" << std::endl;
912 std::cout << "ngsOverlapper-function without reading Sam took: \t" << SEQAN_PROTIMEDIFF(find2_time) << " seconds" << std::endl;
913 std::cout << "ngsOverlapper-function and create IntervalTreeStore without reading Sam took:\t" << SEQAN_PROTIMEDIFF(find2_time) - SEQAN_PROTIMEDIFF(find3_time) << " seconds" << std::endl;
914 #endif
915 return true;
916 }
917
918 //////////////////////////////////////////////////////////////////////////////
919
920 }// namespace SEQAN_NAMESPACE_MAIN
921
922 #endif //#ifndef SEQAN_HEADER_...
923