1  /*==========================================================================
2                 SeqAn - The Library for Sequence Analysis
3                           http://www.seqan.de
4  ============================================================================
5   Copyright (C) 2007
6 
7   This library is free software; you can redistribute it and/or
8   modify it under the terms of the GNU Lesser General Public
9   License as published by the Free Software Foundation; either
10   version 3 of the License, or (at your option) any later version.
11 
12   This library is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15   Lesser General Public License for more details.
16 
17  ============================================================================
18   $Id$
19  ==========================================================================*/
20 
21 #ifndef SEQAN_HEADER_OVERLAP_MODULE_H
22 #define SEQAN_HEADER_OVERLAP_MODULE_H
23 //#define DEBUG_OVERLAP_MODULE
24 
25 namespace SEQAN_NAMESPACE_MAIN
26 {
27 
28 //////////////////////////////////////////////////////////////////////////////
29 // getIdsFroRead
30 //////////////////////////////////////////////////////////////////////////////
31 
32 template<typename TAnnoIds, typename TSpec, typename TConfig, typename TIntervalTree, typename TIntervals>
33 inline void
getIdsForRead(TAnnoIds & ids,FragmentStore<TSpec,TConfig> & me,TIntervalTree & intervalTree,TIntervals & alignIntervals,unsigned offsetInterval)34 getIdsForRead(TAnnoIds & ids, FragmentStore<TSpec, TConfig> & me, TIntervalTree & intervalTree, TIntervals & alignIntervals, unsigned offsetInterval)
35 {
36 	typedef typename FragmentStore<TSpec, TConfig>::TContigPos 		TContigPos;
37 	typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore 	TAnnotationStore;
38 	typedef typename Value<TAnnotationStore>::Type 				TAnnotationStoreElement;
39 	typedef typename TAnnotationStoreElement::TId 				TId;
40 	typedef typename Value<TIntervals>::Type 				TInterval;
41 	typedef 	 String<TId> 						TResult;
42 	typedef typename Iterator<TIntervals >::Type 				TIntervalIter;
43 	typedef typename Iterator<StringSet<TResult > >::Type			TResultIter;
44 	typedef typename Iterator<TResult >::Type				TIdIter;
45 
46 	static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
47 
48 	resize(ids, length(alignIntervals));
49 
50 	TIntervalIter itI = begin(alignIntervals);
51 	TIntervalIter itIEnd = end(alignIntervals);
52 	TResultIter itR = begin(ids);
53 	TResultIter itREnd = end(ids);
54 
55 	// search mapped annotations for each interval of the aligned read and store them in the corresponding list 'ids'
56 	for ( ; itI != itIEnd; goNext(itI), goNext(itR))
57 	{
58 		findIntervalsForInterval(value(itR), intervalTree, getValue(itI), offsetInterval);
59 	}
60 
61 	// check for each aligment-interval, if the inner interval-borders fit to the borders of the annotation id:
62 	itI = begin(alignIntervals);
63 	itR = begin(ids);
64 	TId currentId;
65 	TContigPos beginPos;
66 	TContigPos endPos;
67 	TContigPos help;
68 
69 	for ( ; itI != itIEnd; goNext(itI), goNext(itR))
70 	{
71 		for (unsigned i = 0; i < length(getValue(itR)); ++i)
72 		{
73 			currentId = getValue(getValue(itR), i);
74 			beginPos = getValue(me.annotationStore, currentId).beginPos;
75 			endPos = getValue(me.annotationStore, currentId).endPos;
76 
77 			if (beginPos > endPos)
78 			{
79 				help = beginPos;
80 				beginPos = endPos;
81 				endPos = beginPos;
82 			}
83 			// begin of read
84 			if (itR == begin(ids) && length(ids) > 1)
85 			{
86 				if (static_cast<TContigPos>(getValue(itI).i2 + offsetInterval) < endPos) // if the borders don't fit: delete annotation-id
87 				{
88 					erase(value(itR), i);
89 					--i;
90 				}
91 			}
92 			// end of read
93 			else if (position(itR, ids) == endPosition(ids) - 1 && length(ids) > 1u)
94 			{
95 				if (static_cast<TContigPos>(getValue(itI).i1 - offsetInterval) > beginPos)
96 				{
97 					erase(value(itR), i);
98 					--i;
99 				}
100 			}
101 			// in the middle of the read
102 			else if (length(ids) > 2)
103 			{
104 				if (static_cast<TContigPos>(getValue(itI).i2 + offsetInterval) < endPos)
105 				{
106 					erase(value(itR), i);
107 					--i;
108 				}
109 				else if (static_cast<TContigPos>(getValue(itI).i1 - offsetInterval) > beginPos)
110 				{
111 					erase(value(itR), i);
112 					--i;
113 				}
114 			}
115 		}
116 		if (empty(getValue(itR)) )  // if aligment-interval doesn't fit to any annotation, append INVALID_ID to mark this
117 			appendValue(value(itR), INVALID_ID, Generous());
118 	}
119 }
120 
121 
122 //////////////////////////////////////////////////////////////////////////////
123 ////// ReadAnnoStoreELement
124 //////////////////////////////////////////////////////////////////////////////
125 template <typename TId>
126 struct ReadAnnoStoreElement
127 {
128 	typedef StringSet<String<TId> > TAnnoIds;
129 
130 	TAnnoIds	annoIds;
131 	String<TId> 	parentIds;     // not only for exon-annotations -> more than one parentId possible, only if whole read mapped in parent
132 	TId		contigId;
133 };
134 
135 
136 //////////////////////////////////////////////////////////////////////////////
137 ////// assign Ids to ReadAnnoStore
138 //////////////////////////////////////////////////////////////////////////////
139 template<typename TReadAnnoStore, typename TSpec, typename TConfig, typename TId, typename TAnnoIds>
140 inline void
assignToReadAnnoStore(TReadAnnoStore & readAnnoStore,FragmentStore<TSpec,TConfig> & me,TId readId,TAnnoIds & annoIds)141 assignToReadAnnoStore(TReadAnnoStore &readAnnoStore, FragmentStore<TSpec, TConfig> & me, TId readId, TAnnoIds &annoIds)
142 {
143 	typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore 	TAnnotationStore;
144 	typedef typename Value<TAnnotationStore>::Type 				TAnnotationStoreElement;
145 	typedef typename Iterator<TAnnoIds>::Type				TAnnoIdsIter;
146 	typedef typename Value<TAnnoIds>::Type					TIds;
147 	typedef typename Iterator<TIds>::Type					TIdsIter;
148 
149 	static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
150 
151 	// assign annotationIds:
152 	value(readAnnoStore, readId).annoIds = annoIds;
153 
154 	// assign coresponding parentIds:
155 	clear (value(readAnnoStore, readId).parentIds);
156 	if(!empty(annoIds))
157 	{
158 		TIdsIter itId = begin(front(annoIds));
159 		TIdsIter itIdEnd = end(front(annoIds));
160 		for ( ; itId != itIdEnd; goNext(itId))		// read maps in gene, if all intervals map in gene: at least one exon of the gene has to occur in the id-list of the first interval
161 			if (getValue(itId) != INVALID_ID && !isElement_unsorted(getValue(me.annotationStore, getValue(itId)).parentId, getValue(readAnnoStore, readId).parentIds))
162 				appendValue(value(readAnnoStore, readId).parentIds, getValue(me.annotationStore, getValue(itId)).parentId, Generous() );
163 
164 		if (!empty(getValue(readAnnoStore, readId).parentIds))
165 		{
166 			TAnnoIdsIter itA = begin(annoIds);
167 			TAnnoIdsIter itAEnd = end(annoIds);
168 			goNext(itA);
169 			for ( ; itA != itAEnd; goNext(itA))	// not only for exon-annotations -> more than one parentId possible
170 			{
171 				itId = begin(getValue(itA));		// for each interval of read:
172 				itIdEnd = end(getValue(itA));
173 				for (unsigned i = 0; i < length(getValue(readAnnoStore, readId).parentIds); ++i) // check if at least one child of the parentId occurs
174 				{
175 					for ( ; itId != itIdEnd; goNext(itId))
176 					{
177 						if (getValue(itId) != INVALID_ID && getValue(me.annotationStore, getValue(itId)).parentId == getValue(getValue(readAnnoStore, readId).parentIds, i) )
178 							break;
179 					}
180 					if (itId == itIdEnd)			 // if not, delete parentId
181 					{
182 						erase(value(readAnnoStore, readId).parentIds, i);
183 						--i;
184 					}
185 				}
186 			}
187 		}
188 	}
189 }
190 
191 
192 //////////////////////////////////////////////////////////////////////////////
193 ////// buildTupleCountStore
194 //////////////////////////////////////////////////////////////////////////////
195 template <typename TId>
196 struct TupleCountStoreElement
197 {
198 	typedef String<TId>			TTuple;
199 	typedef String<TTuple > 		TTupleList;
200 	typedef String<unsigned>		TTupleCounts;
201 	typedef String<double>			TTupleNorm;
202 
203 	TTupleList		readConnections;
204 	TTupleCounts		readConnectionCounts;
205 	TTupleNorm		readConnectionNorm;
206 	TTupleList		matePairConnections;
207 	TTupleCounts		matePairConnectionCounts;
208 	TTupleNorm		matePairConnectionNorm;
209 };
210 
211 
212 //////////////////////////////////////////////////////////////////////////////
213 template<typename TTupleCountStore, typename TSpec, typename TConfig, typename TReadAnnoStore>
214 inline void
buildTupleCountStore(TTupleCountStore & tupleCountStore,FragmentStore<TSpec,TConfig> & me,TReadAnnoStore & readAnnoStore,unsigned n,bool exact_nTuple)215 buildTupleCountStore(TTupleCountStore & tupleCountStore,
216 		     FragmentStore<TSpec, TConfig> &  me,
217 		     TReadAnnoStore & readAnnoStore,
218 		     unsigned n,
219 		     bool exact_nTuple)
220 {
221 	typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore 	TAnnotationStore;
222 	typedef typename FragmentStore<TSpec, TConfig>::TContigPos		TPos;
223 	typedef typename FragmentStore<TSpec, TConfig>::TReadStore 		TReadStore;
224 	typedef typename Value<TReadStore>::Type 				TReadStoreElement;
225 	typedef typename TReadStoreElement::TId					TReadId;
226 	typedef typename Value<TAnnotationStore>::Type 				TAnnotationStoreElement;
227 	typedef typename TAnnotationStoreElement::TId 				TId;
228 	typedef typename Iterator<TReadAnnoStore>::Type 			TReadIter;
229 	typedef typename Value<TReadAnnoStore>::Type				TReadAnnoStoreElement;
230 	typedef typename TReadAnnoStoreElement::TAnnoIds			TAnnoIds;
231 	typedef typename Iterator<TAnnoIds>::Type 				TAnnoIdsIter;
232 	typedef typename Value<TAnnoIds>::Type					TIds;
233 	typedef typename Iterator<TIds>::Type					TIdsIter;
234 
235 	static const TReadId INVALID_READ_ID = TReadStoreElement::INVALID_ID;
236 	static const TId INVALID_ANNO_ID = TAnnotationStoreElement::INVALID_ID;
237 
238 	resize(tupleCountStore, length(me.annotationStore));
239 
240 	bool validMate;
241 	TReadIter itRead = begin(readAnnoStore);
242 	TReadIter itReadEnd = end(readAnnoStore);
243 	TIdsIter itP;
244 	TIdsIter itPEnd;
245 	TAnnoIds annoIds;
246 	TAnnoIds tupleSet;
247 	TReadId readId;
248 	TReadId matePairId;
249 	TReadId secReadId;
250 	TAnnoIds secTupleSet;
251 	TId firstAnnoId1;
252 	TId firstAnnoId2;
253 	TAnnoIdsIter itTuple;
254 	TAnnoIdsIter itTupleEnd;
255 	TAnnoIdsIter itSecTuple;
256 	TAnnoIdsIter itSecTupleEnd;
257 	TAnnoIdsIter itAnnoIds;
258 	TAnnoIdsIter itAnnoIdsEnd;
259 	TIds matePairTuple;
260 	TPos beginPos1;
261 	TPos endPos1;
262 	TPos beginPos2;
263 	TPos endPos2;
264 	unsigned pos;
265 
266 	for ( ; itRead != itReadEnd; goNext(itRead))
267 	{
268 		if (!empty(getValue(itRead).parentIds) )
269 		{
270 			itP = begin(getValue(itRead).parentIds);
271 			itPEnd = end(getValue(itRead).parentIds);
272 			for ( ; itP != itPEnd; goNext(itP) )
273 			{
274 				validMate = false;
275 				// create list of all possible tuples for current read:
276 				annoIds = getValue(itRead).annoIds;
277 				clear(tupleSet);
278 				// create all Tuple of length n:
279 				if (exact_nTuple && n <= length(annoIds)) create_nTuple(tupleSet, me, annoIds, getValue(itP), n);
280 				// create all max-Tuple (whole read) for current parentId:
281 				else if (!exact_nTuple && n == 0) create_nTuple(tupleSet, me, annoIds, getValue(itP), length(annoIds));
282 				// create all tuple >= n for current parentId:
283 				else if (!exact_nTuple) create_Tuple(tupleSet, me, annoIds, getValue(itP), n);
284 				if (!empty(tupleSet))
285 				{
286 					// create if necessary list of all possible tuples for second matepair-read:
287 					readId = position(itRead, readAnnoStore);
288 					matePairId = getValue(me.readStore, readId).matePairId;
289 					clear(secTupleSet);
290 					if (matePairId != INVALID_READ_ID)
291 					{
292 						if(getValue(getValue(me.matePairStore, matePairId).readId, 0) == readId)
293 							secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 1);
294 						else
295 							secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 0);
296 
297 						if ( secReadId != INVALID_READ_ID )
298 						{
299 							//if (!empty(getValue(readAnnoStore, secReadId).annoIds))
300 							if ( isElement_unsorted(getValue(itP), getValue(readAnnoStore, secReadId).parentIds) )	// p in parents of matepair? -> annoIds is not empty
301 							{
302 								validMate = true;
303 								annoIds = getValue(readAnnoStore, secReadId).annoIds;
304 								firstAnnoId1 = front(front(tupleSet));	// ids necessary to check positions in aligment
305 								firstAnnoId2 = front(front(annoIds));	// can't be INVALID_ID, because parents was checked
306 
307 								// check if current read-position is smaller than the position of the second read -> tuple are ordered by position
308 								if ( (getValue(me.annotationStore, firstAnnoId1).beginPos <=
309 									getValue(me.annotationStore,firstAnnoId1).endPos &&
310 								      getValue(me.annotationStore, firstAnnoId1).beginPos <
311 								      	getValue(me.annotationStore, firstAnnoId2).endPos) ||
312 								     (getValue(me.annotationStore, firstAnnoId1).beginPos >
313 								     	getValue(me.annotationStore, firstAnnoId1).endPos &&
314 								      getValue(me.annotationStore, firstAnnoId1).endPos <
315 								      	getValue(me.annotationStore, firstAnnoId2).beginPos)  )
316 								{
317 									if (exact_nTuple && n <= length(annoIds)) create_nTuple(secTupleSet, me, annoIds, getValue(itP), n);
318 									else if (!exact_nTuple && n == 0) create_nTuple(secTupleSet, me, annoIds, getValue(itP), length(annoIds));
319 									else if (!exact_nTuple) create_Tuple(secTupleSet, me, annoIds, getValue(itP), n);
320 								}
321 							}
322 						}
323 					}
324 					else validMate = true;
325 
326 					// access to tupleCountStore for all tuple of current read:
327 					if (validMate)
328 					{
329 						itTuple = begin(tupleSet);
330 						itTupleEnd = end(tupleSet);
331 						for ( ; itTuple != itTupleEnd; goNext(itTuple))
332 						{
333 							firstAnnoId1 = front(getValue(itTuple));
334 							erase(value(itTuple), 0);			// first id is not stored; is know by position in tupleCountStore
335 
336 							// readConnections:
337 							if (!empty(getValue(itTuple)))
338 							{
339 								if (searchValue(pos, getValue(itTuple), getValue(tupleCountStore, firstAnnoId1).readConnections))
340 									++value(value(tupleCountStore, firstAnnoId1).readConnectionCounts, pos);
341 								else
342 								{
343 									if (pos != endPosition(getValue(tupleCountStore, firstAnnoId1).readConnections) )
344 									{
345 										resizeSpace(value(tupleCountStore, firstAnnoId1).readConnections, 1, pos, pos, Generous());
346 										assignValue(value(tupleCountStore, firstAnnoId1).readConnections, pos, getValue(itTuple));
347 										insertValue(value(tupleCountStore, firstAnnoId1).readConnectionCounts, pos, 1, Generous());
348 									}
349 									else
350 									{
351 										appendValue(value(tupleCountStore, firstAnnoId1).readConnections, getValue(itTuple), Generous());
352 										appendValue(value(tupleCountStore, firstAnnoId1).readConnectionCounts, 1, Generous());
353 									}
354 								}
355 							}
356 							// matePairConnections:
357 							if (!empty(secTupleSet))
358 							{
359 								itSecTuple = begin(secTupleSet);
360 								itSecTupleEnd = end(secTupleSet);
361 								for ( ; itSecTuple != itSecTupleEnd; goNext(itSecTuple) )
362 								{
363 									matePairTuple = getValue(itTuple);
364 									// INVALID_ID: sign for connection by matepair (apart from that, there are no INVALID_IDs in the list)
365 									appendValue(matePairTuple, INVALID_ANNO_ID, Generous());
366 									if (!empty(getValue(itTuple)) && back(getValue(itTuple)) == front(getValue(itSecTuple)) )		// no id 2x allowed
367 									{
368 										if (exact_nTuple == 0 && n == 0) erase(value(itSecTuple), 0);
369 										else continue;							// tupel would be created double or tupel wouldn't have the length n anymore
370 									}
371 									append(matePairTuple, getValue(itSecTuple), Generous());
372 
373 									if (empty(getValue(itTuple)))
374 									{
375 										beginPos1 = getValue(me.annotationStore, firstAnnoId1).beginPos;
376 										endPos1 = getValue(me.annotationStore, firstAnnoId1).endPos;
377 									}
378 									else
379 									{
380 										beginPos1 = getValue(me.annotationStore, back(getValue(itTuple))).beginPos;
381 										endPos1 = getValue(me.annotationStore, back(getValue(itTuple))).endPos;
382 									}
383 									// begin position of first annotation in tuple of second read
384 									beginPos2 = getValue(me.annotationStore, front(getValue(itSecTuple))).beginPos;
385 									endPos2 = getValue(me.annotationStore, front(getValue(itSecTuple))).endPos;
386 									if ( (beginPos1 <= endPos1 && endPos1 < beginPos2) ||			// no overlapping annotations allowed
387 									     (endPos1 < beginPos1 && beginPos1 < endPos2) )
388 									{
389 										if (searchValue(pos, matePairTuple, getValue(tupleCountStore, firstAnnoId1).matePairConnections))
390 											++value(value(tupleCountStore, firstAnnoId1).matePairConnectionCounts, pos);
391 										else
392 										{
393 											if (pos != endPosition(getValue(tupleCountStore, firstAnnoId1).matePairConnections) )
394 											{
395 												resizeSpace(value(tupleCountStore, firstAnnoId1).matePairConnections, 1, pos, pos, Generous());
396 												assignValue(value(tupleCountStore, firstAnnoId1).matePairConnections, pos, matePairTuple);
397 												insertValue(value(tupleCountStore, firstAnnoId1).matePairConnectionCounts, pos, 1, Generous());
398 											}
399 											else
400 											{
401 												appendValue(value(tupleCountStore, firstAnnoId1).matePairConnections, matePairTuple, Generous());
402 												appendValue(value(tupleCountStore, firstAnnoId1).matePairConnectionCounts, 1, Generous());
403 											}
404 										}
405 									}
406 								}
407 							}
408 						}
409 					}
410 				}
411 			}
412 		}
413 	}
414 }
415 
416 
417 //////////////////////////////////////////////////////////////////////////////
418 ////// buildAnnoCountStoreg++  -I../seqan/projects/library/ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -O3 -pedantic -lrt  main.cpp   -o main
419 //////////////////////////////////////////////////////////////////////////////
420 template<typename TAnnoCountStore, typename TSpec, typename TConfig, typename TReadAnnoStore>
421 inline void
buildAnnoCountStore(TAnnoCountStore & annoCountStore,FragmentStore<TSpec,TConfig> & me,TReadAnnoStore & readAnnoStore)422 buildAnnoCountStore(TAnnoCountStore & annoCountStore, FragmentStore<TSpec, TConfig> & me, TReadAnnoStore & readAnnoStore)
423 {
424 	typedef typename Iterator<TReadAnnoStore>::Type 			TReadIter;
425 	typedef typename FragmentStore<TSpec, TConfig>::TReadStore 		TReadStore;
426 	typedef typename Value<TReadStore>::Type 				TReadStoreElement;
427 	typedef typename Value<TReadAnnoStore>::Type				TReadAnnoStoreElement;
428 	typedef typename TReadAnnoStoreElement::TAnnoIds			TAnnoIds;
429 	typedef typename Iterator<TAnnoIds>::Type				TAnnoIdsIter;
430 	typedef typename Value<TAnnoIds>::Type					TIds;
431 	typedef typename Iterator<TIds>::Type 					TIdsIter;
432 	typedef typename Value<TIds>::Type					TId;
433 	typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore 	TAnnotationStore;
434 	typedef typename Value<TAnnotationStore>::Type 				TAnnotationStoreElement;
435 
436 	static const TId INVALID_READ_ID = TReadStoreElement::INVALID_ID;
437 	static const TId INVALID_ANNO_ID = TAnnotationStoreElement::INVALID_ID;
438 
439 	resize(annoCountStore, length(me.annotationStore), 0);
440 
441 	TReadIter itRead = begin(readAnnoStore);
442 	TReadIter itReadEnd = end(readAnnoStore);
443 	TId readId;
444 	TId matePairId;
445 	TId secReadId;
446 	TIds interSecIds;
447 	TIdsIter itP;
448 	TIdsIter itPEnd;
449 	TAnnoIdsIter itAnnoIds;
450 	TAnnoIdsIter itAnnoIdsEnd;
451 	TIdsIter itId;
452 	TIdsIter itIdEnd;
453 
454 	// increment for each read respective to its mapped ids the count in the annoCountStore
455 	for ( ; itRead != itReadEnd; goNext(itRead))
456 	{
457 		if (!empty(getValue(itRead).annoIds) )
458 		{
459 			readId = position(itRead, readAnnoStore);
460 			matePairId = getValue(me.readStore, readId).matePairId;
461 			if (matePairId != INVALID_READ_ID)
462 			{
463 				if (getValue(getValue(me.matePairStore, matePairId).readId, 0) == readId)
464 					secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 1);
465 				else
466 					secReadId = getValue(getValue(me.matePairStore, matePairId).readId, 0);
467 			}
468 			// for each parentId: we just want to count annotations, in which the read mapped
469 			if (!empty(getValue(itRead).parentIds))
470 			{
471 				itP = begin(getValue(itRead).parentIds);
472 				itPEnd = end(getValue(itRead).parentIds);
473 				for (; itP != itPEnd; goNext(itP) )
474 				{
475 					// check mate-read to prevent double counts
476 					if (matePairId != INVALID_READ_ID)
477 					{
478 						if (!isElement_unsorted(getValue(itP), getValue(readAnnoStore, secReadId).parentIds) )
479 							continue; // if matepair read doesn't map in same parentId: no count (go to next parentId)
480 
481 						// count annotations, which occur in both reads, shouldn't be increment for the read with the bigger readId
482 						if (secReadId < readId )
483 						{
484 							clear(interSecIds);
485 							// just check the periphery annotations
486 							// if the end of the current read mapped in a same annotation as the start of the second read:
487 							if ( interSec(interSecIds, back(getValue(itRead).annoIds), front(getValue(readAnnoStore, secReadId).annoIds)) )
488 							{
489 								for (unsigned i = 0; i < length(interSecIds); ++i)
490 								{
491 									if (getValue(me.annotationStore, getValue(interSecIds, i) ).parentId ==  getValue(itP))
492 										--value(annoCountStore, getValue(interSecIds, i));
493 									// decrement the corresponding count
494 								}
495 							}
496 							// or if the start of the current read mapped in a same annotation as the end of the second read:
497 							else if ( interSec(interSecIds, front(getValue(itRead).annoIds),
498 								  back(getValue(readAnnoStore, secReadId).annoIds)) )
499 							{
500 								for (unsigned i = 0; i < length(interSecIds); ++i)
501 								{
502 									if (getValue(me.annotationStore, getValue(interSecIds, i) ).parentId ==  getValue(itP))
503 										--value(annoCountStore, getValue(interSecIds, i));
504 								}
505 							}
506 							if (getValue(itP) != INVALID_ANNO_ID) --value(annoCountStore, getValue(itP));
507 						}
508 					}
509 
510 					// count for all annoIds
511 					itAnnoIds = begin(getValue(itRead).annoIds);
512 					itAnnoIdsEnd = end(getValue(itRead).annoIds);
513 					for ( ; itAnnoIds != itAnnoIdsEnd; goNext(itAnnoIds))
514 					{
515 						itId = begin(getValue(itAnnoIds));
516 						itIdEnd = end(getValue(itAnnoIds));
517 						for ( ; itId != itIdEnd; goNext(itId))
518 							if (getValue(itId) != INVALID_ANNO_ID && getValue(me.annotationStore, getValue(itId)).parentId == getValue(itP) )
519 								++value(annoCountStore, getValue(itId));
520 					}
521 
522 					// count for parentIds (already selected)
523 					if (getValue(itP) != INVALID_ANNO_ID) ++value(annoCountStore, getValue(itP));
524 				}
525 			}
526 		}
527 	}
528 }
529 
530 
531 //////////////////////////////////////////////////////////////////////////////
532 ////// Overlap Module
533 //////////////////////////////////////////////////////////////////////////////
534 template<typename TReadAnnoStore, typename TAnnoCountStore, typename TTupleCountStore, typename TSpec, typename TConfig>
535 inline void
getResults(TReadAnnoStore & readAnnoStore,TAnnoCountStore & annoCountStore,TTupleCountStore & tupleCountStore,FragmentStore<TSpec,TConfig> & me,unsigned tupelSize,bool exact_nTuple,unsigned offsetInterval,unsigned thresholdGaps,bool unknownO)536 getResults(TReadAnnoStore & readAnnoStore,
537 	   TAnnoCountStore & annoCountStore,
538 	   TTupleCountStore & tupleCountStore,
539 	   FragmentStore<TSpec, TConfig> & me,
540 	   unsigned tupelSize,
541 	   bool exact_nTuple,
542 	   unsigned offsetInterval,
543 	   unsigned thresholdGaps,
544 	   bool unknownO)
545 {
546 	typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore 	TAnnotationStore;
547 	typedef typename Value<TAnnotationStore>::Type 				TAnnotationStoreElement;
548 	typedef typename TAnnotationStoreElement::TId 				TId;
549 	typedef typename FragmentStore<TSpec, TConfig>::TIntervalTreeStore 	TIntervalTreeStore;
550 	typedef typename Iterator<TIntervalTreeStore>::Type			TIntervalTree;
551 	typedef typename Value<TReadAnnoStore>::Type 				TReadAnnoStoreElement;
552 	typedef typename TReadAnnoStoreElement::TAnnoIds 			TAnnoIds;
553 
554 	typedef typename FragmentStore<TSpec, TConfig>::TAlignedReadStore	TAlignedReadStore;
555 	typedef typename Position<TAlignedReadStore>::Type 			TAlignPos;
556 	typedef 	 String<AlignIntervalsStoreElement<> > 			TAlignIntervalsStore;
557 	typedef typename Iterator<TAlignIntervalsStore>::Type 			TAlignIntervalsStoreIter;
558 
559 	resize(readAnnoStore, length(me.readStore));
560 
561 	TIntervalTree intervalTree;
562 
563 	// extract intervals from alignedReadStore and store them in AlignIntervalsStore:
564 	TAlignIntervalsStore alignIntervalsStore;
565 	buildAlignIntervalsStore(alignIntervalsStore, me, thresholdGaps);
566 
567 	if (!empty(alignIntervalsStore))
568 	{
569 		TAlignPos alignPos;
570 		TId contigId;
571 		TId readId;
572 		TAnnoIds  ids;
573 
574 		TAlignIntervalsStoreIter it = begin(alignIntervalsStore);
575 		TAlignIntervalsStoreIter itEnd = end(alignIntervalsStore);
576 		// for each item in alignIntervalsStore:
577 		for ( ; it != itEnd; goNext(it))
578 		{
579 			// get ids from alignedReadStore (same position as in alignIntervalsStore):
580 			alignPos = position(it, alignIntervalsStore);
581 			contigId = getValue(me.alignedReadStore, alignPos).contigId;
582 			readId = getValue(me.alignedReadStore, alignPos).readId;
583 			// get respective intervalTree
584 			if (unknownO ||  getValue(me.alignedReadStore, alignPos).beginPos <= getValue(me.alignedReadStore, alignPos).endPos)
585 				intervalTree = begin(me.intervalTreeStore_F, Standard()) + contigId; 	//getValue(me.intervalTreeStore_F, contigId);
586 			else
587 				intervalTree = begin(me.intervalTreeStore_R, Standard()) + contigId;        //getValue(me.intervalTreeStore_R, contigId);
588 
589 			// get annotationStore-Ids for these intervals:
590 			clear(ids);
591 			if ((*intervalTree).interval_counter != 0)
592 				getIdsForRead(ids, me, *intervalTree, getValue(it).intervals, offsetInterval);
593 			// assign Ids from mapped annotations to readAnnoStore:
594 			value(readAnnoStore, readId).contigId = contigId;
595 			assignToReadAnnoStore(readAnnoStore, me, readId, ids);
596 		}
597 	}
598 	buildAnnoCountStore(annoCountStore, me, readAnnoStore);
599 	buildTupleCountStore(tupleCountStore, me, readAnnoStore, tupelSize, exact_nTuple);
600 }
601 
602 
603 //////////////////////////////////////////////////////////////////////////////
604 /// get normalized values for annotations And get Map for Gene orientations (necessary for annotation Output)
605 //////////////////////////////////////////////////////////////////////////////
606 template<typename TAnnoNormStore, typename TMapO, typename TAnnoCountStore, typename TSpec, typename TConfig>
607 inline void
normalizeAnnoCounts(TAnnoNormStore & annoNormStore,TMapO & mapO,TAnnoCountStore & annoCountStore,FragmentStore<TSpec,TConfig> & me)608 normalizeAnnoCounts(TAnnoNormStore &annoNormStore, TMapO &mapO, TAnnoCountStore &annoCountStore, FragmentStore<TSpec, TConfig> &me)
609 {
610 	typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore 	TAnnotationStore;
611 	typedef typename Value<TAnnotationStore>::Type				TAnnotationStoreElement;
612 	typedef typename TAnnotationStoreElement::TId				TId;
613 	typedef typename FragmentStore<TSpec, TConfig>::TContigPos		TPos;
614 	typedef typename FragmentStore<TSpec, TConfig>::TReadStore		TReadStore;
615 	typedef typename Size<TReadStore>::Type					TReadStoreSize;
616 	typedef typename Iterator<TAnnotationStore>::Type			TAnnoIter;
617 	typedef typename Iterator<TAnnoCountStore>::Type 			TCountIter;
618 	typedef typename Iterator<TAnnoNormStore>::Type				TNormIter;
619 	typedef typename Size<TPos>::Type					TSize;
620 	typedef 	 String<TSize>						TChildrenLengths;
621 	typedef typename Iterator<String<TSize> >::Type				TLengthIter;
622 	typedef 	 Pair<TId, TChildrenLengths>				TPair;
623 	typedef typename Value<TMapO>::Type					TPairO;
624 	typedef 	 Map<TPair>						TMap;
625 	typedef typename Iterator<TMap>::Type					TMapIter;
626 
627 	static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
628 	static const TPos INVALID_POS = TAnnotationStoreElement::INVALID_POS;
629 
630 	resize(annoNormStore, length(annoCountStore), 0);
631 
632 	TReadStoreSize readNo = length(me.readStore) - length(me.matePairStore);
633 
634 	TMap map;
635 	clear(map);
636 	clear(mapO);
637 
638 	if(!empty(me.annotationStore))
639 	{
640 		TAnnoIter itA = begin(me.annotationStore);
641 		TAnnoIter itAEnd = end(me.annotationStore);
642 		TCountIter itC = begin(annoCountStore);
643 		TNormIter itN = begin(annoNormStore);
644 		TChildrenLengths childrenLengths;
645 		TPair pair;
646 		TPairO pairO;
647 		TSize length;
648 
649 		for ( ; itA != itAEnd; goNext(itA), goNext(itC), goNext(itN) )
650 		{
651 			if (getValue(itA).beginPos == INVALID_POS && getValue(itA).parentId == INVALID_ID) 	// make entry for each gene/parent in map:
652 			{
653 				clear(childrenLengths);
654 				pair.i1 = position(itA, me.annotationStore);
655 				pair.i2 = childrenLengths;
656 				insert(map, pair);								// for lengths
657 
658 				pairO.i1 = position(itA, me.annotationStore);
659 				pairO.i2 = 0;
660 				insert(mapO, pairO);								// for orientation
661 			}
662 			else if (getValue(itA).beginPos != INVALID_POS)						// for each exon/child:
663 			{
664 				if (getValue(itA).beginPos <= getValue(itA).endPos)
665 					length = getValue(itA).endPos - getValue(itA).beginPos + 1;
666 				else
667 					length = getValue(itA).beginPos - getValue(itA).endPos + 1;
668 
669 				value(itN) = ((double)1000000000 * (double)getValue(itC))/((double)readNo * (double)length);		// calculate normalized expression-value
670 
671 				if (getValue(itA).parentId != INVALID_ID)					// append length to gene/parent lengths
672 				{
673 					appendValue(mapValue(map, getValue(itA).parentId), length, Generous());
674 					if (getValue(itA).beginPos > getValue(itA).endPos)
675 						mapValue(mapO, getValue(itA).parentId) = 1;
676 				}
677 			}
678 		}
679 	}
680 
681 	if (!empty(map))
682 	{
683 		TMapIter itM = begin(map);
684 		TMapIter itMEnd = end(map);
685 		TSize length;
686 		TLengthIter itL;
687 		TLengthIter itLEnd;
688 		for ( ; itM != itMEnd; goNext(itM))	// calculate normalized gene/parent expression-values
689 		{
690 			length = 0;
691 			itL = begin(value(itM).i2);
692 			itLEnd = end(value(itM).i2);
693 			for ( ; itL != itLEnd; goNext(itL))
694 				length += getValue(itL);
695 			value(annoNormStore, value(itM).i1) = ((double)1000000000 * (double)getValue(annoCountStore, value(itM).i1) )/((double)readNo * (double)length);
696 		}
697 	}
698 }
699 
700 
701 //////////////////////////////////////////////////////////////////////////////
702 /// get normalized values for tuple
703 //////////////////////////////////////////////////////////////////////////////
704 template<typename TTupleCountStore, typename TSpec, typename TConfig>
705 inline void
normalizeTupleCounts(TTupleCountStore & tupleCountStore,FragmentStore<TSpec,TConfig> & me)706 normalizeTupleCounts(TTupleCountStore &tupleCountStore, FragmentStore<TSpec, TConfig> &me)
707 {
708 	typedef typename FragmentStore<TSpec, TConfig>::TAnnotationStore 	TAnnotationStore;
709 	typedef typename Value<TAnnotationStore>::Type				TAnnotationStoreElement;
710 	typedef typename TAnnotationStoreElement::TId				TId;
711 	typedef typename Value<TTupleCountStore>::Type				TTupleCountStoreElement;
712 	typedef typename TTupleCountStoreElement::TTupleList			TTupleList;
713 	typedef typename TTupleCountStoreElement::TTupleCounts			TTupleCounts;
714 	typedef typename TTupleCountStoreElement::TTupleNorm			TTupleNorm;
715 	typedef typename TTupleCountStoreElement::TTuple			TTuple;
716 	typedef typename Iterator<TTupleCountStore>::Type 			TStoreIter;
717 	typedef typename Iterator<TTupleList>::Type				TTupleListIter;
718 	typedef typename Iterator<TTupleCounts>::Type				TCountIter;
719 	typedef typename Iterator<TTupleNorm>::Type				TNormIter;
720 	typedef typename Iterator<TTuple>::Type					TTupleIter;
721 	typedef typename FragmentStore<TSpec, TConfig>::TContigPos		TPos;
722 	typedef typename Size<TPos>::Type					TSize;
723 	typedef typename FragmentStore<TSpec, TConfig>::TReadStore		TReadStore;
724 	typedef typename Size<TReadStore>::Type					TReadStoreSize;
725 
726 	static const TId INVALID_ID = TAnnotationStoreElement::INVALID_ID;
727 
728 	TReadStoreSize readNo = length(me.readStore) - length(me.matePairStore);
729 
730 	if (!empty(tupleCountStore))
731 	{
732 		TStoreIter itS = begin(tupleCountStore);
733 		TStoreIter itSEnd = end(tupleCountStore);
734 		TTupleListIter itT;
735 		TTupleListIter itTEnd;
736 		TCountIter itC;
737 		TNormIter itN;
738 		TSize tupleLength;
739 		TTupleIter itId;
740 		TTupleIter itIdEnd;
741 		for ( ; itS != itSEnd; goNext(itS))
742 		{
743 			// readConnections:
744 			resize(value(itS).readConnectionNorm, length(getValue(itS).readConnections));
745 			if (!empty(getValue(itS).readConnections))
746 			{
747 				itT = begin(getValue(itS).readConnections);
748 				itTEnd = end(getValue(itS).readConnections);
749 				itC = begin(getValue(itS).readConnectionCounts);
750 				itN = begin(getValue(itS).readConnectionNorm);
751 				for ( ; itT != itTEnd; goNext(itT), goNext(itC), goNext(itN))
752 				{
753 					tupleLength = 0;
754 					itId = begin(getValue(itT));
755 					itIdEnd = end(getValue(itT));
756 					for ( ; itId != itIdEnd; goNext(itId))
757 					{
758 						if (getValue(me.annotationStore, getValue(itId)).beginPos <= getValue(me.annotationStore, getValue(itId)).endPos)
759 							tupleLength += getValue(me.annotationStore, getValue(itId)).endPos - getValue(me.annotationStore, getValue(itId)).beginPos + 1;
760 						else
761 							tupleLength += getValue(me.annotationStore, getValue(itId)).beginPos - getValue(me.annotationStore, getValue(itId)).endPos + 1;
762 					}
763 					value(itN) = ((double)1000000000 * (double)getValue(itC)) / ((double)readNo * (double)tupleLength);
764 				}
765 			}
766 			// matePairConnections:
767 			resize(value(itS).matePairConnectionNorm, length(getValue(itS).matePairConnections));
768 			if (!empty(getValue(itS).matePairConnections))
769 			{
770 				itT = begin(getValue(itS).matePairConnections);
771 				itTEnd = end(getValue(itS).matePairConnections);
772 				itC = begin(getValue(itS).matePairConnectionCounts);
773 				itN = begin(getValue(itS).matePairConnectionNorm);
774 				for ( ; itT != itTEnd; goNext(itT), goNext(itC), goNext(itN))
775 				{
776 					tupleLength = 0;
777 					itId = begin(getValue(itT));
778 					itIdEnd = end(getValue(itT));
779 					for ( ; itId != itIdEnd; goNext(itId))
780 					{
781 						if (getValue(itId) != INVALID_ID)
782 						{
783 							if (getValue(me.annotationStore, getValue(itId)).beginPos <= getValue(me.annotationStore, getValue(itId)).endPos)
784 								tupleLength += getValue(me.annotationStore, getValue(itId)).endPos - getValue(me.annotationStore, getValue(itId)).beginPos + 1;
785 							else
786 								tupleLength += getValue(me.annotationStore, getValue(itId)).beginPos - getValue(me.annotationStore, getValue(itId)).endPos + 1;
787 						}
788 					}
789 					value(itN) = ((double)1000000000 * (double)getValue(itC)) / ((double)readNo * (double)tupleLength);
790 				}
791 			}
792 		}
793 	}
794 }
795 
796 
797 //////////////////////////////////////////////////////////////////////////////
798 /// NGS Overlapper main function
799 //////////////////////////////////////////////////////////////////////////////
800 
801 inline bool
ngsOverlapper(CharString const & nameSAM,CharString const & nameGFF,CharString const & outputPath,unsigned tupleValue,bool exact_nTuple,unsigned thresholdGaps,unsigned offsetInterval,unsigned thresholdCount,double thresholdRPKM,bool unknownO,bool fusion,bool gtf)802 ngsOverlapper(CharString const &nameSAM, CharString const &nameGFF, CharString const &outputPath,
803 	      unsigned tupleValue, bool exact_nTuple,
804 	      unsigned thresholdGaps, unsigned offsetInterval,
805 	      unsigned thresholdCount, double thresholdRPKM,
806 	      bool unknownO,
807 	      bool fusion,
808 	      bool gtf)
809 {
810 	FragmentStore<> me;
811 #ifdef DEBUG_OVERLAP_MODULE
812 	SEQAN_PROTIMESTART(find1_time);
813 #endif
814 	// build contigStore from FASTA file
815 #ifdef DEBUG_OVERLAP_MODULE
816 	std::cout << "load Sam..." << std::endl;
817 #endif
818 	// read aligned reads in FragmentStore from Sam files
819 	std::fstream fileSAM;
820 	fileSAM.open(toCString(nameSAM), std::ios_base::in | std::ios_base::binary);
821 	if(!fileSAM.is_open()) return false;
822 	read(fileSAM, me, Sam());
823 	fileSAM.close();
824 
825 	// readAnnotations from Gff or Gtf:
826 	if (gtf == 0)
827 	{
828 #ifdef DEBUG_OVERLAP_MODULE
829 		SEQAN_PROTIMESTART(find2_time);
830 		std::cout << "load Gff..." << std::endl;
831 #endif
832 		readAnnotationsFromGFF(me, toCString(nameGFF));
833 	}
834 	else
835 	{
836 #ifdef DEBUG_OVERLAP_MODULE
837 		SEQAN_PROTIMESTART(find2_time);
838 		std::cout << "load Gff..." << std::endl;
839 #endif
840 		readAnnotationsFromGTF(me, toCString(nameGFF));
841 	}
842 
843 	// create IntervalTreeStore:
844 #ifdef DEBUG_OVERLAP_MODULE
845 	SEQAN_PROTIMESTART(find3_time);
846 #endif
847 	createIntervalTreeStore(me, unknownO);
848 #ifdef DEBUG_OVERLAP_MODULE
849 	std::cout << "create intervalTreeStores from annotationStore took: \t" << SEQAN_PROTIMEDIFF(find3_time) << " seconds" << std::endl;
850 #endif
851 
852 	// build stores for results:
853 	String<ReadAnnoStoreElement<unsigned> >			readAnnoStore;
854 	String<unsigned> 					annoCountStore;
855 	String<TupleCountStoreElement<unsigned> >		tupleCountStore;
856 	String<TupleCountStoreElement_Fusion<unsigned> >	tupleCountStore_Fusion;			// additional Store, if fusion genes should be checked
857 
858 
859 
860 	// get results with additional check for transfusion genes (will be changed later additionally)
861 	if (fusion == 1)
862 		getResults_Fusion(readAnnoStore, annoCountStore, tupleCountStore, tupleCountStore_Fusion, me, tupleValue, exact_nTuple, offsetInterval, thresholdGaps, unknownO);
863 	else // get normal results:
864 		getResults(readAnnoStore, annoCountStore, tupleCountStore, me, tupleValue, exact_nTuple, offsetInterval, thresholdGaps, unknownO);
865 
866 
867 	// normalize:
868 	String<double>			annoNormStore;
869 	Map<Pair<unsigned, bool> > 	mapO;
870 	normalizeAnnoCounts(annoNormStore, mapO, annoCountStore, me);
871 	normalizeTupleCounts(tupleCountStore, me);
872 	if (fusion == 1)
873 		normalizeTupleCounts_Fusion(tupleCountStore_Fusion, me);
874 
875 
876 	// output:
877 	CharString pathReadOutput = outputPath;
878 	append(pathReadOutput, "readOutput.gff");
879 	std::fstream readOutput;
880 	readOutput.open(toCString(pathReadOutput), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
881 	createReadCountGFF(readOutput, readAnnoStore, me);
882 	readOutput.close();
883 
884 	CharString pathAnnoOutput = outputPath;
885 	append(pathAnnoOutput, "annoOutput.gff");
886 	std::fstream annoOutput;
887 	annoOutput.open(toCString(pathAnnoOutput), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
888 	createAnnoCountGFF(annoOutput, annoCountStore, annoNormStore, me, mapO);
889 	annoOutput.close();
890 
891 	CharString pathTupleOutput = outputPath;
892 	append(pathTupleOutput, "tupleOutput.gff");
893 	std::fstream tupleOutput;
894 	tupleOutput.open(toCString(pathTupleOutput), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
895 	createTupleCountGFF(tupleOutput, tupleCountStore, me, thresholdCount, thresholdRPKM);
896 	tupleOutput.close();
897 
898 	// additional output, if fusion genes were checked
899 	if (fusion == 1)
900 	{
901 		CharString pathTupleOutput_Fusion = outputPath;
902 		append(pathTupleOutput_Fusion, "tupleOutput_Fusion.gff");
903 		std::fstream tupleOutput_Fusion;
904 		tupleOutput_Fusion.open(toCString(pathTupleOutput_Fusion), std::ios_base::out | std::ios_base::trunc | std::ios_base::binary);
905 		createTupleCountGFF_Fusion(tupleOutput_Fusion, tupleCountStore_Fusion, me, thresholdCount, thresholdRPKM);
906 		tupleOutput_Fusion.close();
907 	}
908 
909 
910 #ifdef DEBUG_OVERLAP_MODULE
911 	std::cout << "ngsOverlapper-function took: \t" << SEQAN_PROTIMEDIFF(find1_time) << " seconds" << std::endl;
912 	std::cout << "ngsOverlapper-function without reading Sam took: \t" << SEQAN_PROTIMEDIFF(find2_time) << " seconds" << std::endl;
913 	std::cout << "ngsOverlapper-function and create IntervalTreeStore without reading Sam took:\t" << SEQAN_PROTIMEDIFF(find2_time) - SEQAN_PROTIMEDIFF(find3_time) << " seconds" << std::endl;
914 #endif
915 	return true;
916 }
917 
918 //////////////////////////////////////////////////////////////////////////////
919 
920 }// namespace SEQAN_NAMESPACE_MAIN
921 
922 #endif //#ifndef SEQAN_HEADER_...
923