1 #ifndef RAZERS_PAIRED_MATCH_FILTER_H_
2 #define RAZERS_PAIRED_MATCH_FILTER_H_
3 
4 #include <seqan/graph_types/graph_idmanager.h>
5 
6 #include "razers.h"
7 
8 namespace seqan {
9 
10 template <typename TThreadLocalStorage>
11 class FilterPatternLSetMaxErrorsWrapper;
12 template <typename TThreadLocalStorage>
13 class FilterPatternRSetMaxErrorsWrapper;
14 
15 // ============================================================================
16 // Forwards
17 // ============================================================================
18 
19 // ============================================================================
20 // Tags, Classes, Enums
21 // ============================================================================
22 
23 template <typename TOptionsSpec, typename TReadSeqSet, typename TCallback>
24 class PairedMatchFilter
25 {
26 public:
27     // Number of reads.
28     unsigned readCount;
29     // If a read has more than matchThreshold matches then it gets a histogram.
30     unsigned matchThreshold;
31     // Fraction of reads expect to have histograms for.
32     double frac;
33     // Count matches for each read
34     String<unsigned> hitCount;
35     // Map from read number to histogram id.
36     std::unordered_map<unsigned, unsigned> pairIdToHistogramId;
37     // Id manager for histogram.
38     IdManager<unsigned> idManager;
39     // Histograms.
40     String<String<unsigned> > histograms;
41     // Ids of the reads that are purged.
42     String<unsigned> purgedPairIds;
43     // The callback context object.
44     Holder<TCallback> callback;
45     // Read ID offset.
46     unsigned readOffset;
47     // The read sequences.
48     TReadSeqSet const & readSeqs;
49     // The options object.
50     RazerSCoreOptions<TOptionsSpec> const & options;
51 
PairedMatchFilter(unsigned readCount_,unsigned matchThreshold_,double frac_,TCallback & callback_,unsigned readOffset_,TReadSeqSet const & readSeqs_,RazerSCoreOptions<TOptionsSpec> const & options_)52     PairedMatchFilter(unsigned readCount_, unsigned matchThreshold_, double frac_, TCallback & callback_, unsigned readOffset_, TReadSeqSet const & readSeqs_, RazerSCoreOptions<TOptionsSpec> const & options_) :
53         readCount(readCount_), matchThreshold(matchThreshold_), frac(frac_), callback(callback_), readOffset(readOffset_), readSeqs(readSeqs_), options(options_)
54     {
55         resize(hitCount, readCount, 0, Exact());
56         reserve(histograms, unsigned(frac * readCount));
57     }
58 
59 };
60 
61 // ============================================================================
62 // Metafunctions
63 // ============================================================================
64 
65 // ============================================================================
66 // Functions
67 // ============================================================================
68 
69 template <typename TOptionsSpec, typename TReadSeqSet, typename TCallback>
70 inline unsigned
_createHistogram(PairedMatchFilter<TOptionsSpec,TReadSeqSet,TCallback> & filter,unsigned pairId)71 _createHistogram(PairedMatchFilter<TOptionsSpec, TReadSeqSet, TCallback> & filter, unsigned pairId)
72 {
73     unsigned result = obtainId(filter.idManager);
74     if (result >= length(filter.histograms))
75         resize(filter.histograms, length(filter.histograms) + 1);
76     resize(filter.histograms[result], 1 + (int)(filter.options.errorRate * (length(filter.readSeqs[2 * pairId]) + length(filter.readSeqs[2 * pairId + 1]))), 0, Exact());
77     SEQAN_ASSERT_LT(result, length(filter.histograms));
78     return result;
79 }
80 
81 template <typename TOptionsSpec, typename TReadSeqSet, typename TCallback>
82 inline void
registerRead(PairedMatchFilter<TOptionsSpec,TReadSeqSet,TCallback> & filter,unsigned pairId,int score)83 registerRead(PairedMatchFilter<TOptionsSpec, TReadSeqSet, TCallback> & filter, unsigned pairId, int score)
84 {
85     // std::cerr << "registering read " << pairId << std::endl;
86     if (filter.hitCount[pairId - filter.readOffset] == std::numeric_limits<unsigned>::max())
87         return;
88 
89     if (++filter.hitCount[pairId - filter.readOffset] < filter.matchThreshold)
90         return;
91 
92     // TODO(holtgrew): Maybe global read to histogram map; faster?
93 
94     // Get histogram id, insert new histogram if necessary, exit if no histogram yet.
95     unsigned histogramId = 0;
96     if (filter.hitCount[pairId - filter.readOffset] == filter.matchThreshold)
97     {
98         // std::cerr << "new histogram for read " << pairId << std::endl;
99         histogramId = _createHistogram(filter, pairId);
100         filter.pairIdToHistogramId[pairId] = histogramId;
101     }
102     else
103     {
104         // std::cerr << "updating histogram for read " << pairId << std::endl;
105         typedef typename std::unordered_map<unsigned, unsigned>::iterator TIterator;
106         TIterator it = filter.pairIdToHistogramId.find(pairId);
107         SEQAN_ASSERT(it != filter.pairIdToHistogramId.end());
108         histogramId = it->second;
109     }
110 
111     // Insert value into histogram.
112     _incrementCount(filter, histogramId, score);
113 }
114 
115 template <typename TOptionsSpec, typename TReadSeqSet, typename TCallback>
116 inline bool
processRead(PairedMatchFilter<TOptionsSpec,TReadSeqSet,TCallback> & filter,unsigned pairId)117 processRead(PairedMatchFilter<TOptionsSpec, TReadSeqSet, TCallback> & filter, unsigned pairId)
118 {
119     typedef typename std::unordered_map<unsigned, unsigned>::iterator TIterator;
120 
121     if (filter.hitCount[pairId - filter.readOffset] < filter.matchThreshold)
122         return false;
123 
124     // std::cerr << "processing read " << pairId << std::endl;
125     // Get histogram id, insert new histogram if necessary, exit if no histogram yet.
126     TIterator it = filter.pairIdToHistogramId.find(pairId);
127     if (it == filter.pairIdToHistogramId.end())
128         return false;  // Must have been disabled before.
129 
130     unsigned histogramId = it->second;
131 
132     // Perform actions.
133     int newLimit;
134 
135     if (filter.options.scoreDistanceRange == 0)
136     {
137         newLimit = _newLimit(filter, histogramId);
138         if (newLimit == NO_NEW_LIMIT)
139             return false;
140 
141         if (filter.options.purgeAmbiguous)
142         {
143 //            std::cerr << "PURGED " << pairId << std::endl;
144             appendValue(filter.purgedPairIds, pairId);
145             newLimit = 0;
146         }
147     }
148     else
149     {
150         newLimit = _newLimitDistRange(filter, histogramId);
151         if (newLimit == CAN_BE_PURGED)
152         {
153             // std::cerr << "PURGED " << pairId << std::endl;
154             appendValue(filter.purgedPairIds, pairId);
155             newLimit = 0;
156         }
157     }
158 
159 //    std::cerr << "LIMITING " << pairId << "\t" << filter.histograms[histogramId][0] << "\t" << filter.hitCount[pairId - filter.readOffset] << "\t" << newLimit << std::endl;
160     FilterPatternLSetMaxErrorsWrapper<TCallback> wrapperL(value(filter.callback));
161     FilterPatternRSetMaxErrorsWrapper<TCallback> wrapperR(value(filter.callback));
162     setMaxErrors(wrapperL, pairId, newLimit - 1);
163     setMaxErrors(wrapperR, pairId, newLimit - 1);
164     if (filter.options.errorCutOff[2 * pairId] > newLimit)
165         filter.options.errorCutOff[2 * pairId] = newLimit;
166     if (filter.options.errorCutOff[2 * pairId + 1] > newLimit)
167         filter.options.errorCutOff[2 * pairId + 1] = newLimit;
168 
169     if (newLimit == 0)
170     {
171         _freeHistogram(filter, histogramId);
172         filter.pairIdToHistogramId.erase(pairId);
173         filter.hitCount[pairId - filter.readOffset] = std::numeric_limits<unsigned>::max();
174         return true;
175     }
176     return false;
177 }
178 
179 }  // namespace seqan
180 
181 #endif  // #ifndef RAZERS_PAIRED_MATCH_FILTER_H_
182