1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2015, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34
35 // TODO(holtgrew): Allow the storage of additional fields.
36 /* This could look as follows:
37
38 The values of the additional fields are stored as strings. Either we store name/value pairs or, more efficiently,
39 store a stringset for each match, with the same number of entries. An additional String/Map maps from field name to
40 index in this string sets.
41 */
42
43 #ifndef INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
44 #define INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
45
46 namespace seqan {
47
48 // ============================================================================
49 // Forwards
50 // ============================================================================
51
52 // ============================================================================
53 // Tags, Classes, Enums
54 // ============================================================================
55
56 /*!
57 * @class LocalMatch
58 * @headerfile <seqan/parse_lm.h>
59 * @brief Stores information about local matches.
60 *
61 * @signature template <typename TId, typename TPosition>
62 * class LocalMatch;
63 *
64 * @tparam TId Type to use for subject/query id.
65 * @tparam TPosition Type to use for storing positions.
66 *
67 * Matches on the reverse-complement are encoded by the begin position being greater than the end position.
68 *
69 * Sequence names are not stored in LocalMatch objects but in the @link LocalMatchStore @endlink they belong to.
70 *
71 * @see LocalMatchStore
72 *
73 *
74 * @var TId LocalMatch::queryId
75 * @brief The id of the query.
76 *
77 * @var TPosition LocalMatch::queryBeginPos
78 * @brief Begin position of local match in the query.
79 *
80 * @var TPosition LocalMatch::queryEndPos
81 * @brief End position of local match in the query.
82 *
83 * @var TPosition LocalMatch::subjectBeginPos
84 * @brief Begin position of local match in the subject.
85 *
86 * @var TPosition LocalMatch::subjectEndPos
87 * @brief End position of local match in the subject.
88 *
89 * @var TId LocalMatch::subjectId
90 * @brief The id of the subject.
91 */
92
93 template <typename TId, typename TPosition>
94 class LocalMatch
95 {
96 public:
97 TId id;
98 TId subjectId;
99 TPosition subjectBeginPos;
100 TPosition subjectEndPos;
101 TId queryId;
102 TPosition queryBeginPos;
103 TPosition queryEndPos;
104
LocalMatch()105 LocalMatch() :
106 id(MaxValue<TId>::VALUE),
107 subjectId(MaxValue<TId>::VALUE),
108 subjectBeginPos(MaxValue<TPosition>::VALUE),
109 subjectEndPos(MaxValue<TPosition>::VALUE),
110 queryId(MaxValue<TId>::VALUE),
111 queryBeginPos(MaxValue<TPosition>::VALUE),
112 queryEndPos(MaxValue<TPosition>::VALUE)
113 {}
114
LocalMatch(TId id_,TId subjectId_,TPosition subjectBeginPos_,TPosition subjectEndPos_,TId queryId_,TPosition queryBeginPos_,TPosition queryEndPos_)115 LocalMatch(TId id_,
116 TId subjectId_,
117 TPosition subjectBeginPos_,
118 TPosition subjectEndPos_,
119 TId queryId_,
120 TPosition queryBeginPos_,
121 TPosition queryEndPos_) :
122 id(id_),
123 subjectId(subjectId_),
124 subjectBeginPos(subjectBeginPos_),
125 subjectEndPos(subjectEndPos_),
126 queryId(queryId_),
127 queryBeginPos(queryBeginPos_),
128 queryEndPos(queryEndPos_)
129 {}
130
131 bool operator==(LocalMatch const & other) const
132 {
133 return id == other.id && subjectId == other.subjectId && subjectBeginPos == other.subjectBeginPos &&
134 subjectEndPos == other.subjectEndPos && queryId == other.queryId &&
135 queryBeginPos == other.queryBeginPos && queryEndPos == other.queryEndPos;
136 }
137 };
138
139 /*!
140 * @class LocalMatchStoreConfig
141 * @headerfile <seqan/parse_lm.h>
142 * @brief Stores information about local matches.
143 *
144 * @signature template <typename TSpec>
145 * struct LocalMatchStoreConfig;
146 *
147 * @tparam TSpec Specializing type.
148 *
149 *
150 * @typedef LocalMatchStoreConfig::TId;
151 * @brief Type to use for ids.
152 * @signature typedef unsigned LocalMatchStoreConfig::TId;
153 *
154 * @typedef LocalMatchStoreConfig::TPosition;
155 * @brief The type to use for positions.
156 * @signature typedef (...) LocalMatchStoreConfig::TPosition;
157 */
158
159 template <typename TSpec>
160 struct LocalMatchStoreConfig
161 {
162 typedef unsigned TId;
163 typedef typename Position<Dna5String>::Type TPosition;
164 };
165
166 /*!
167 * @class LocalMatchStore
168 * @headerfile <seqan/parse_lm.h>
169 * @brief Stores information about local matches.
170 *
171 * @signature template <typename TSpec = void, TConfig = LocalMatchStoreConfig<TSpec> >
172 * struct LocalMatchStore;
173 *
174 * @tparam TSpec Specialization tag.
175 * @tparam TConfig Configuration class.
176 *
177 * The LocalMatchStore provides information about matches. Similar to the @link FragmentStore @endlink, the
178 * information is split into multiple sub stores. Each sub store stores a part of the information.
179 *
180 * The LocalMatchStore#matchStore stores the information about a match. The LocalMatchStore#sequenceNameStore stores
181 * the sequence names. These both sub stores are "core stores", they are filled with consistent information, i.e. for
182 * each sequence id in the matchStore, there has to be a valid entry in the sequenceNameStore.
183 *
184 * The LocalMatchStore#cigarStore optionally stores CIGAR strings for the matches. Its entries are referenced by
185 * <tt>matchStore[i].id</tt>.
186 *
187 * @section Examples
188 *
189 * Read Lastz matches from a link RecordReader and then print them to stdout.
190 *
191 * @code{.cpp}
192 * // Read local alignments from record reader. Note that in real-world code,
193 * // you should have error handling.
194 * LocalMatchStore<> lmStore;
195 * while (!atEnd(recordReader))
196 * readRecord(lmStore, recordReader, StellarGff());
197 *
198 * // Print local alignment information to stdout.
199 * std::cout << "# Reverse strandness is indicated by end < begin\n"
200 * << "#db\tdb_beg\tdb_end\tquery\tq_beg\tq_end\n";
201 * for (unsigned i = 0; i < length(lmStore.matchStore); ++i)
202 * std::cout << lmStore.sequenceNameStore[lmStore.matchStore[i].queryId] << "\t"
203 * << lmStore.matchStore[i].queryBeginPos << "\t"
204 * << lmStore.matchStore[i].queryEndPos << "\t"
205 * << lmStore.sequenceNameStore[lmStore.matchStore[i].subjectId] << "\t"
206 * << lmStore.matchStore[i].subjectBeginPos << "\t"
207 * << lmStore.matchStore[i].subjectEndPos << "\n";
208 * @endcode
209 * @see LocalMatch
210 *
211 * @var TMatchStore LocalMatchStore::matchStore
212 * @brief String storing the LocalMatch local matches.
213 *
214 * @var TStringSet LocalMatchStore::sequenceNameStore
215 * @brief StringSet storing the sequence names.
216 *
217 * @var TCigarString LocalMatchStore::cigarStore
218 * @brief String storing the CIGAR strings.
219 */
220
221 /*!
222 * @fn LocalMatchStore#readRecord
223 *
224 * @headerfile <seqan/parse_lm.h>
225 *
226 * @brief Read Lastz "general" format record.
227 *
228 * @signature int readRecord(store, reader, tag);
229 *
230 * @param[in,out] store LocalMatchStore object to read into.
231 * @param[in,out] stream SinglePassRecordReader to read from.
232 * @param[in] tag The tag for selecting the format, one of BlastnTabular, LastzGeneral, and StellarGff.
233 *
234 * @return int 0 on success, non-0 on errors and EOF
235 */
236
237 template <typename TSpec=void, typename TConfig=LocalMatchStoreConfig<TSpec> >
238 class LocalMatchStore
239 {
240 public:
241 // ----------------------------------------------------------------------------
242 // Typedefs
243 // ----------------------------------------------------------------------------
244
245 typedef typename TConfig::TId TId;
246 typedef typename TConfig::TPosition TPosition;
247
248 typedef LocalMatch<TId, TPosition> TLocalMatch;
249 typedef String<TLocalMatch> TMatchStore;
250
251 typedef String<CigarElement<> > TCigarString;
252 typedef String<TCigarString> TCigarStore;
253
254 typedef StringSet<CharString> TNameStore;
255 typedef NameStoreCache<TNameStore> TNameStoreCache_;
256
257 // ----------------------------------------------------------------------------
258 // Member Variables
259 // ----------------------------------------------------------------------------
260
261 TNameStore sequenceNameStore;
262 TNameStoreCache_ _sequenceNameStoreCache;
263
264 TMatchStore matchStore;
265 TCigarStore cigarStore;
266
LocalMatchStore()267 LocalMatchStore() :
268 _sequenceNameStoreCache(sequenceNameStore)
269 {}
270 };
271
272 // ============================================================================
273 // Metafunctions
274 // ============================================================================
275
276 // ============================================================================
277 // Functions
278 // ============================================================================
279
280 // ----------------------------------------------------------------------------
281 // Function registerSequenceName
282 // ----------------------------------------------------------------------------
283
284 template <typename TLocalMatchStore>
285 inline void
registerSequenceName(TLocalMatchStore & store,CharString const & sequenceName)286 registerSequenceName(TLocalMatchStore & store,
287 CharString const & sequenceName)
288 {
289 unsigned id = 0;
290 if (!getIdByName(store.sequenceNameStore, sequenceName, id, store._sequenceNameStoreCache))
291 {
292 id = length(store.sequenceNameStore);
293 appendName(store.sequenceNameStore, sequenceName, store._sequenceNameStoreCache);
294 }
295 }
296
297 // ----------------------------------------------------------------------------
298 // Function appendLocalMatch
299 // ----------------------------------------------------------------------------
300
301 /*!
302 * @fn LocalMatchStore#appendLocalMatch
303 * @brief Append a new local match to a @link LocalMatchStore @endlink
304 *
305 * @signature void appendLocalMatchStore(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
306 * @signature void appendLocalMatchStore(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos, cigarStringBuffer);
307 *
308 * @param[in,out] store The LocalMatchStore to add the local match to.
309 * @param[in] subjectId Numeric subject sequence identifier, @link IntegerConcept @endlink.
310 * @param[in] subjectName The textual name of the query sequence, @link CharString @endlink.
311 * @param[in] subjectBegin The begin position of the match in the subject, @link IntegerConcept @endlink.
312 * @param[in] subjectEnd The end position of the match in the subject, @link IntegerConcept @endlink.
313 * @param[in] queryId Numeric query sequence identifier, @link IntegerConcept @endlink.
314 * @param[in] queryName The textual name of the query sequence, @link CharString @endlink.
315 * @param[in] queryBegin The begin position of the match in the query, @link IntegerConcept @endlink.
316 * @param[in] queryEnd The end position of the match in the query, @link IntegerConcept @endlink.
317 * @param[in] cigarStringBuffer Buffer with the cigar string of the local alignment, @link CharString @endlink.
318 *
319 * Matches on the reverse-complement are encoded by the begin position being greater than the begin position.
320 */
321
322 template <typename TLocalMatchStore, typename TPosition>
323 inline void
appendLocalMatch(TLocalMatchStore & store,unsigned const & subjectId,TPosition const subjectBeginPos,TPosition const subjectEndPos,unsigned const & queryId,TPosition const queryBeginPos,TPosition const queryEndPos)324 appendLocalMatch(TLocalMatchStore & store,
325 unsigned const & subjectId,
326 TPosition const subjectBeginPos,
327 TPosition const subjectEndPos,
328 unsigned const & queryId,
329 TPosition const queryBeginPos,
330 TPosition const queryEndPos)
331 {
332 typedef typename TLocalMatchStore::TLocalMatch TLocalMatch;
333
334 SEQAN_ASSERT_LT(subjectId, length(store.sequenceNameStore));
335 SEQAN_ASSERT_LT(queryId, length(store.sequenceNameStore));
336
337 TLocalMatch localMatch(length(store.matchStore),
338 subjectId,
339 subjectBeginPos,
340 subjectEndPos,
341 queryId,
342 queryBeginPos,
343 queryEndPos);
344 appendValue(store.matchStore, localMatch);
345 }
346
347 template <typename TLocalMatchStore, typename TPosition>
348 inline void
appendLocalMatch(TLocalMatchStore & store,CharString const & subjectName,TPosition const subjectBeginPos,TPosition const subjectEndPos,CharString const & queryName,TPosition const queryBeginPos,TPosition const queryEndPos)349 appendLocalMatch(TLocalMatchStore & store,
350 CharString const & subjectName,
351 TPosition const subjectBeginPos,
352 TPosition const subjectEndPos,
353 CharString const & queryName,
354 TPosition const queryBeginPos,
355 TPosition const queryEndPos)
356 {
357 typedef typename TLocalMatchStore::TId TId;
358
359 // Get id for subject and query sequences; Insert sequences into name stores/caches if not already there.
360 TId subjectId = 0;
361 if (!getIdByName(store.sequenceNameStore, subjectName, subjectId, store._sequenceNameStoreCache))
362 {
363 subjectId = length(store.sequenceNameStore);
364 appendName(store.sequenceNameStore, subjectName, store._sequenceNameStoreCache);
365 }
366 TId queryId = 0;
367 if (!getIdByName(store.sequenceNameStore, queryName, queryId, store._sequenceNameStoreCache))
368 {
369 queryId = length(store.sequenceNameStore);
370 appendName(store.sequenceNameStore, queryName, store._sequenceNameStoreCache);
371 }
372
373 appendLocalMatch(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
374 }
375
376 template <typename TLocalMatchStore, typename TPosition>
377 inline void
appendLocalMatch(TLocalMatchStore & store,CharString const & subjectName,TPosition const subjectBeginPos,TPosition const subjectEndPos,CharString const & queryName,TPosition const queryBeginPos,TPosition const queryEndPos,CharString const & cigarStringBuffer)378 appendLocalMatch(TLocalMatchStore & store,
379 CharString const & subjectName,
380 TPosition const subjectBeginPos,
381 TPosition const subjectEndPos,
382 CharString const & queryName,
383 TPosition const queryBeginPos,
384 TPosition const queryEndPos,
385 CharString const & cigarStringBuffer)
386 {
387 // Append local match.
388 appendLocalMatch(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos);
389 // Make space for CIGAR string.
390 resize(store.cigarStore, back(store.matchStore).id + 1);
391 // TODO(holtgrew): Something can go wrong when parsing CIGAR string, need return value?
392 // Parse out cigar string.
393 /*
394 typedef Stream<CharArray<char const *> > TCharArrayStream;
395 TCharArrayStream cigarStream(&cigarStringBuffer[0], &cigarStringBuffer[0] + length(cigarStringBuffer));
396 RecordReader<TCharArrayStream, SinglePass<> > recordReader(cigarStream);
397 */
398 DirectionIterator<CharString const, Input>::Type iter = begin(cigarStringBuffer);
399 CharString numBuf;
400 while (!atEnd(iter))
401 {
402 // Get number string into buffer.
403 clear(numBuf);
404 readUntil(numBuf, iter, NotFunctor<IsDigit>());
405 unsigned num = 0;
406 lexicalCast<unsigned>(num, numBuf);
407
408 // Read operation char and advance record reader.
409 char op;
410 readOne(op, iter);
411 // Append CIGAR element to CIGAR string.
412 appendValue(back(store.cigarStore), CigarElement<>(op, num));
413 }
414 }
415
416 } // namespace seqan
417
418 #endif // INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
419