1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 
35 // TODO(holtgrew): Allow the storage of additional fields.
36 /* This could look as follows:
37 
38    The values of the additional fields are stored as strings.  Either we store name/value pairs or, more efficiently,
39    store a stringset for each match, with the same number of entries.  An additional String/Map maps from field name to
40    index in this string sets.
41  */
42 
43 #ifndef INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
44 #define INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
45 
46 namespace seqan {
47 
48 // ============================================================================
49 // Forwards
50 // ============================================================================
51 
52 // ============================================================================
53 // Tags, Classes, Enums
54 // ============================================================================
55 
56 /*!
57  * @class LocalMatch
58  * @headerfile <seqan/parse_lm.h>
59  * @brief Stores information about local matches.
60  *
61  * @signature template <typename TId, typename TPosition>
62  *            class LocalMatch;
63  *
64  * @tparam TId       Type to use for subject/query id.
65  * @tparam TPosition Type to use for storing positions.
66  *
67  * Matches on the reverse-complement are encoded by the begin position being greater than the end position.
68  *
69  * Sequence names are not stored in LocalMatch objects but in the @link LocalMatchStore @endlink they belong to.
70  *
71  * @see LocalMatchStore
72  *
73  *
74  * @var TId LocalMatch::queryId
75  * @brief The id of the query.
76  *
77  * @var TPosition LocalMatch::queryBeginPos
78  * @brief Begin position of local match in the query.
79  *
80  * @var TPosition LocalMatch::queryEndPos
81  * @brief End position of local match in the query.
82  *
83  * @var TPosition LocalMatch::subjectBeginPos
84  * @brief Begin position of local match in the subject.
85  *
86  * @var TPosition LocalMatch::subjectEndPos
87  * @brief End position of local match in the subject.
88  *
89  * @var TId LocalMatch::subjectId
90  * @brief The id of the subject.
91  */
92 
93 template <typename TId, typename TPosition>
94 class LocalMatch
95 {
96 public:
97     TId id;
98     TId subjectId;
99     TPosition subjectBeginPos;
100     TPosition subjectEndPos;
101     TId queryId;
102     TPosition queryBeginPos;
103     TPosition queryEndPos;
104 
LocalMatch()105     LocalMatch() :
106             id(MaxValue<TId>::VALUE),
107             subjectId(MaxValue<TId>::VALUE),
108             subjectBeginPos(MaxValue<TPosition>::VALUE),
109             subjectEndPos(MaxValue<TPosition>::VALUE),
110             queryId(MaxValue<TId>::VALUE),
111             queryBeginPos(MaxValue<TPosition>::VALUE),
112             queryEndPos(MaxValue<TPosition>::VALUE)
113     {}
114 
LocalMatch(TId id_,TId subjectId_,TPosition subjectBeginPos_,TPosition subjectEndPos_,TId queryId_,TPosition queryBeginPos_,TPosition queryEndPos_)115     LocalMatch(TId id_,
116                TId subjectId_,
117                TPosition subjectBeginPos_,
118                TPosition subjectEndPos_,
119                TId queryId_,
120                TPosition queryBeginPos_,
121                TPosition queryEndPos_) :
122             id(id_),
123             subjectId(subjectId_),
124             subjectBeginPos(subjectBeginPos_),
125             subjectEndPos(subjectEndPos_),
126             queryId(queryId_),
127             queryBeginPos(queryBeginPos_),
128             queryEndPos(queryEndPos_)
129     {}
130 
131     bool operator==(LocalMatch const & other) const
132     {
133         return id == other.id && subjectId == other.subjectId && subjectBeginPos == other.subjectBeginPos &&
134                 subjectEndPos == other.subjectEndPos && queryId == other.queryId &&
135                 queryBeginPos == other.queryBeginPos && queryEndPos == other.queryEndPos;
136     }
137 };
138 
139 /*!
140  * @class LocalMatchStoreConfig
141  * @headerfile <seqan/parse_lm.h>
142  * @brief Stores information about local matches.
143  *
144  * @signature template <typename TSpec>
145  *            struct LocalMatchStoreConfig;
146  *
147  * @tparam TSpec Specializing type.
148  *
149  *
150  * @typedef LocalMatchStoreConfig::TId;
151  * @brief Type to use for ids.
152  * @signature typedef unsigned LocalMatchStoreConfig::TId;
153  *
154  * @typedef LocalMatchStoreConfig::TPosition;
155  * @brief The type to use for positions.
156  * @signature typedef (...) LocalMatchStoreConfig::TPosition;
157  */
158 
159 template <typename TSpec>
160 struct LocalMatchStoreConfig
161 {
162     typedef unsigned TId;
163     typedef typename Position<Dna5String>::Type TPosition;
164 };
165 
166 /*!
167  * @class LocalMatchStore
168  * @headerfile <seqan/parse_lm.h>
169  * @brief Stores information about local matches.
170  *
171  * @signature template <typename TSpec = void, TConfig = LocalMatchStoreConfig<TSpec> >
172  *            struct LocalMatchStore;
173  *
174  * @tparam TSpec   Specialization tag.
175  * @tparam TConfig Configuration class.
176  *
177  * The LocalMatchStore provides information about matches.  Similar to the @link FragmentStore @endlink, the
178  * information is split into multiple sub stores.  Each sub store stores a part of the information.
179  *
180  * The LocalMatchStore#matchStore stores the information about a match.  The LocalMatchStore#sequenceNameStore stores
181  * the sequence names.  These both sub stores are "core stores", they are filled with consistent information, i.e. for
182  * each sequence id in the matchStore, there has to be a valid entry in the sequenceNameStore.
183  *
184  * The LocalMatchStore#cigarStore optionally stores CIGAR strings for the matches.  Its entries are referenced by
185  * <tt>matchStore[i].id</tt>.
186  *
187  * @section Examples
188  *
189  * Read Lastz matches from a link RecordReader and then print them to stdout.
190  *
191  * @code{.cpp}
192  * // Read local alignments from record reader.  Note that in real-world code,
193  * // you should have error handling.
194  * LocalMatchStore<> lmStore;
195  * while (!atEnd(recordReader))
196  *     readRecord(lmStore, recordReader, StellarGff());
197  *
198  * // Print local alignment information to stdout.
199  * std::cout << "# Reverse strandness is indicated by end < begin\n"
200  *           << "#db\tdb_beg\tdb_end\tquery\tq_beg\tq_end\n";
201  * for (unsigned i = 0; i < length(lmStore.matchStore); ++i)
202  *     std::cout << lmStore.sequenceNameStore[lmStore.matchStore[i].queryId] << "\t"
203  *               << lmStore.matchStore[i].queryBeginPos << "\t"
204  *               << lmStore.matchStore[i].queryEndPos << "\t"
205  *               << lmStore.sequenceNameStore[lmStore.matchStore[i].subjectId] << "\t"
206  *               << lmStore.matchStore[i].subjectBeginPos << "\t"
207  *               << lmStore.matchStore[i].subjectEndPos << "\n";
208  * @endcode
209  * @see LocalMatch
210  *
211  * @var TMatchStore LocalMatchStore::matchStore
212  * @brief String storing the LocalMatch local matches.
213  *
214  * @var TStringSet LocalMatchStore::sequenceNameStore
215  * @brief StringSet storing the sequence names.
216  *
217  * @var TCigarString LocalMatchStore::cigarStore
218  * @brief String storing the CIGAR strings.
219  */
220 
221 /*!
222  * @fn LocalMatchStore#readRecord
223  *
224  * @headerfile <seqan/parse_lm.h>
225  *
226  * @brief Read Lastz "general" format record.
227  *
228  * @signature int readRecord(store, reader, tag);
229  *
230  * @param[in,out] store  LocalMatchStore object to read into.
231  * @param[in,out] stream SinglePassRecordReader to read from.
232  * @param[in]     tag    The tag for selecting the format, one of BlastnTabular, LastzGeneral, and StellarGff.
233  *
234  * @return int 0 on success, non-0 on errors and EOF
235  */
236 
237 template <typename TSpec=void, typename TConfig=LocalMatchStoreConfig<TSpec> >
238 class LocalMatchStore
239 {
240 public:
241     // ----------------------------------------------------------------------------
242     // Typedefs
243     // ----------------------------------------------------------------------------
244 
245     typedef typename TConfig::TId TId;
246     typedef typename TConfig::TPosition TPosition;
247 
248     typedef LocalMatch<TId, TPosition> TLocalMatch;
249     typedef String<TLocalMatch> TMatchStore;
250 
251     typedef String<CigarElement<> > TCigarString;
252     typedef String<TCigarString> TCigarStore;
253 
254     typedef StringSet<CharString> TNameStore;
255     typedef NameStoreCache<TNameStore> TNameStoreCache_;
256 
257     // ----------------------------------------------------------------------------
258     // Member Variables
259     // ----------------------------------------------------------------------------
260 
261     TNameStore       sequenceNameStore;
262     TNameStoreCache_ _sequenceNameStoreCache;
263 
264     TMatchStore       matchStore;
265     TCigarStore       cigarStore;
266 
LocalMatchStore()267     LocalMatchStore() :
268             _sequenceNameStoreCache(sequenceNameStore)
269     {}
270 };
271 
272 // ============================================================================
273 // Metafunctions
274 // ============================================================================
275 
276 // ============================================================================
277 // Functions
278 // ============================================================================
279 
280 // ----------------------------------------------------------------------------
281 // Function registerSequenceName
282 // ----------------------------------------------------------------------------
283 
284 template <typename TLocalMatchStore>
285 inline void
registerSequenceName(TLocalMatchStore & store,CharString const & sequenceName)286 registerSequenceName(TLocalMatchStore & store,
287                      CharString const & sequenceName)
288 {
289     unsigned id = 0;
290     if (!getIdByName(store.sequenceNameStore, sequenceName, id, store._sequenceNameStoreCache))
291     {
292         id = length(store.sequenceNameStore);
293         appendName(store.sequenceNameStore, sequenceName, store._sequenceNameStoreCache);
294     }
295 }
296 
297 // ----------------------------------------------------------------------------
298 // Function appendLocalMatch
299 // ----------------------------------------------------------------------------
300 
301 /*!
302  * @fn LocalMatchStore#appendLocalMatch
303  * @brief Append a new local match to a @link LocalMatchStore @endlink
304  *
305  * @signature void appendLocalMatchStore(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
306  * @signature void appendLocalMatchStore(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos, cigarStringBuffer);
307  *
308  * @param[in,out] store             The LocalMatchStore to add the local match to.
309  * @param[in]     subjectId         Numeric subject sequence identifier, @link IntegerConcept @endlink.
310  * @param[in]     subjectName       The textual name of the query sequence, @link CharString @endlink.
311  * @param[in]     subjectBegin      The begin position of the match in the subject, @link IntegerConcept @endlink.
312  * @param[in]     subjectEnd        The end position of the match in the subject, @link IntegerConcept @endlink.
313  * @param[in]     queryId           Numeric query sequence identifier, @link IntegerConcept @endlink.
314  * @param[in]     queryName         The textual name of the query sequence, @link CharString @endlink.
315  * @param[in]     queryBegin        The begin position of the match in the query, @link IntegerConcept @endlink.
316  * @param[in]     queryEnd          The end position of the match in the query, @link IntegerConcept @endlink.
317  * @param[in]     cigarStringBuffer Buffer with the cigar string of the local alignment,  @link CharString @endlink.
318  *
319  * Matches on the reverse-complement are encoded by the begin position being greater than the begin position.
320  */
321 
322 template <typename TLocalMatchStore, typename TPosition>
323 inline void
appendLocalMatch(TLocalMatchStore & store,unsigned const & subjectId,TPosition const subjectBeginPos,TPosition const subjectEndPos,unsigned const & queryId,TPosition const queryBeginPos,TPosition const queryEndPos)324 appendLocalMatch(TLocalMatchStore & store,
325                  unsigned const & subjectId,
326                  TPosition const subjectBeginPos,
327                  TPosition const subjectEndPos,
328                  unsigned const & queryId,
329                  TPosition const queryBeginPos,
330                  TPosition const queryEndPos)
331 {
332     typedef typename TLocalMatchStore::TLocalMatch TLocalMatch;
333 
334     SEQAN_ASSERT_LT(subjectId, length(store.sequenceNameStore));
335     SEQAN_ASSERT_LT(queryId, length(store.sequenceNameStore));
336 
337     TLocalMatch localMatch(length(store.matchStore),
338                            subjectId,
339                            subjectBeginPos,
340                            subjectEndPos,
341                            queryId,
342                            queryBeginPos,
343                            queryEndPos);
344     appendValue(store.matchStore, localMatch);
345 }
346 
347 template <typename TLocalMatchStore, typename TPosition>
348 inline void
appendLocalMatch(TLocalMatchStore & store,CharString const & subjectName,TPosition const subjectBeginPos,TPosition const subjectEndPos,CharString const & queryName,TPosition const queryBeginPos,TPosition const queryEndPos)349 appendLocalMatch(TLocalMatchStore & store,
350                  CharString const & subjectName,
351                  TPosition const subjectBeginPos,
352                  TPosition const subjectEndPos,
353                  CharString const & queryName,
354                  TPosition const queryBeginPos,
355                  TPosition const queryEndPos)
356 {
357     typedef typename TLocalMatchStore::TId         TId;
358 
359     // Get id for subject and query sequences;  Insert sequences into name stores/caches if not already there.
360     TId subjectId = 0;
361     if (!getIdByName(store.sequenceNameStore, subjectName, subjectId, store._sequenceNameStoreCache))
362     {
363         subjectId = length(store.sequenceNameStore);
364         appendName(store.sequenceNameStore, subjectName, store._sequenceNameStoreCache);
365     }
366     TId queryId = 0;
367     if (!getIdByName(store.sequenceNameStore, queryName, queryId, store._sequenceNameStoreCache))
368     {
369         queryId = length(store.sequenceNameStore);
370         appendName(store.sequenceNameStore, queryName, store._sequenceNameStoreCache);
371     }
372 
373     appendLocalMatch(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
374 }
375 
376 template <typename TLocalMatchStore, typename TPosition>
377 inline void
appendLocalMatch(TLocalMatchStore & store,CharString const & subjectName,TPosition const subjectBeginPos,TPosition const subjectEndPos,CharString const & queryName,TPosition const queryBeginPos,TPosition const queryEndPos,CharString const & cigarStringBuffer)378 appendLocalMatch(TLocalMatchStore & store,
379                  CharString const & subjectName,
380                  TPosition const subjectBeginPos,
381                  TPosition const subjectEndPos,
382                  CharString const & queryName,
383                  TPosition const queryBeginPos,
384                  TPosition const queryEndPos,
385                  CharString const & cigarStringBuffer)
386 {
387     // Append local match.
388     appendLocalMatch(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos);
389     // Make space for CIGAR string.
390     resize(store.cigarStore, back(store.matchStore).id + 1);
391     // TODO(holtgrew): Something can go wrong when parsing CIGAR string, need return value?
392     // Parse out cigar string.
393     /*
394     typedef Stream<CharArray<char const *> > TCharArrayStream;
395     TCharArrayStream cigarStream(&cigarStringBuffer[0], &cigarStringBuffer[0] + length(cigarStringBuffer));
396     RecordReader<TCharArrayStream, SinglePass<> > recordReader(cigarStream);
397     */
398     DirectionIterator<CharString const, Input>::Type iter = begin(cigarStringBuffer);
399     CharString numBuf;
400     while (!atEnd(iter))
401     {
402         // Get number string into buffer.
403         clear(numBuf);
404         readUntil(numBuf, iter, NotFunctor<IsDigit>());
405         unsigned num = 0;
406         lexicalCast<unsigned>(num, numBuf);
407 
408         // Read operation char and advance record reader.
409         char op;
410         readOne(op, iter);
411         // Append CIGAR element to CIGAR string.
412         appendValue(back(store.cigarStore), CigarElement<>(op, num));
413     }
414 }
415 
416 }  // namespace seqan
417 
418 #endif  // INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
419