// ==========================================================================
//                 SeqAn - The Library for Sequence Analysis
// ==========================================================================
// Copyright (c) 2006-2018, Knut Reinert, FU Berlin
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of Knut Reinert or the FU Berlin nor the names of
//       its contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
//
// ==========================================================================
// Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
// ==========================================================================
// Tests for reduced_aminoacid module
// ==========================================================================

37 #ifndef SEQAN_TESTS_REDUCED_ALPHABET_H_
38 #define SEQAN_TESTS_REDUCED_ALPHABET_H_
39
40 #include <seqan/basic.h>
41 #include <seqan/sequence.h>
42 #include <seqan/file.h>
43
44 #include <seqan/reduced_aminoacid.h>
45 #include <seqan/modifier.h>
46 #include <seqan/index.h>
47
48 using namespace seqan;
49
// NOTE(review): this test is compiled out with `#if 0`; the reason is not
// recorded in this file. It is kept as-is so it can be re-enabled later.
#if 0
// Checks the ClusterReduction<N> alphabets for N = 12, 10 and 8: converting
// from characters and from AminoAcid values must produce the expected
// reduced-alphabet strings.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_cluster_red)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<ClusterReduction<8> > >
        ReducedAminoAcid24to8;
    typedef SimpleType<unsigned char, ReducedAminoAcid_<ClusterReduction<10> > >
        ReducedAminoAcid24to10;
    typedef SimpleType<unsigned char, ReducedAminoAcid_<ClusterReduction<12> > >
        ReducedAminoAcid24to12;

    // Every letter in upper and lower case, followed by '*', '+' and '#'.
    CharString str = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    String<AminoAcid> aas = "ARNDCQEGHILKMFPSTWYVBZX*";

    // N = 12
    {
        String<ReducedAminoAcid24to12> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AANNCCNNRRFFGGHHIISSRRIIIINNSSPPRRRRSSSSSSIIWWSSFFRR*SS");
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ARNNCRRGHIIRIFPSSWFINRS*");
    }

    // N = 10
    {
        String<ReducedAminoAcid24to10> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AANNCCNNRRFFGGHHIIAARRIIIINNAAPPRRRRAAAAAAIIFFAAFFRR*AA");
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ARNNCRRGHIIRIFPAAFFINRA*");
    }

    // N = 8
    {
        String<ReducedAminoAcid24to8> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AARRCCRRRRFFGGRRIIAARRIIIIRRAAPPRRRRAAAAAAIIFFAAFFRR*AA");
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ARRRCRRGRIIRIFPAAFFIRRA*");
    }
}
#endif

// Checks the Buchfink11 reduction: converting from characters and from
// AminoAcid values must produce the expected reduced-alphabet strings.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_buchfink11)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<Buchfink11> >
        ReducedAminoAcidBuchfink11;

    // Every letter in upper and lower case, followed by '*', '+' and '#'.
    CharString str = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 11
    {
        // Conversion from plain characters.
        String<ReducedAminoAcidBuchfink11> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AABBCCBBBBFFGGHHIIIIBBIIMMBBBBPPBBBBAAAACCIIWWAAYYBBFAA");
        // Conversion from AminoAcid values.
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ABCBBFGHIIBIMBBPBBAACIWYBAF");
    }
}

// Checks the Cannata10 reduction: converting from characters and from
// AminoAcid values must produce the expected reduced-alphabet strings.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_cannata10)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<Cannata10> >
        ReducedAminoAcidCannata10;

    // Every letter in upper and lower case, followed by '*', '+' and '#'.
    CharString str = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    {
        // Conversion from plain characters.
        String<ReducedAminoAcidCannata10> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AABBCCBBEEFFAAHHIIIIKKIIIIBBKKPPEEKKAAAACCIIWWAAFFEEFAA");
        // Conversion from AminoAcid values.
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ABCBEFAHIIKIIBKPEKAACIWFEAF");
    }
}

// Checks the Li10 reduction: converting from characters and from
// AminoAcid values must produce the expected reduced-alphabet strings.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_li10)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<Li10> >
        ReducedAminoAcidLi10;

    // Every letter in upper and lower case, followed by '*', '+' and '#'.
    CharString str = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    {
        // Conversion from plain characters.
        String<ReducedAminoAcidLi10> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AABBCCBBBBFFGGHHIIJJKKJJJJHHKKPPBBKKAAAACCIIFFAAFFBBFAA");
        // Conversion from AminoAcid values.
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ABCBBFGHIJKJJHKPBKAACIFFBAF");
    }
}

// Checks the Solis10 reduction: converting from characters and from
// AminoAcid values must produce the expected reduced-alphabet strings.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_solis10)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<Solis10> >
        ReducedAminoAcidSolis10;

    // Every letter in upper and lower case, followed by '*', '+' and '#'.
    CharString str = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    {
        // Conversion from plain characters.
        String<ReducedAminoAcidSolis10> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AABBCCBBBBFFGGHHIIIIKKIIIIGGHHPPGGHHGGPPCCIIWWAAWWBBFAA");
        // Conversion from AminoAcid values.
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ABCBBFGHIIKIIGHPGHGPCIWWBAF");
    }
}

// Checks the Murphy5 reduction: converting from characters and from
// AminoAcid values must produce the expected reduced-alphabet strings.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy5)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<Murphy5> >
        ReducedAminoAcidMurphy5;

    // Every letter in upper and lower case, followed by '*', '+' and '#'.
    CharString str = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 5
    {
        // Conversion from plain characters.
        String<ReducedAminoAcidMurphy5> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AABBCCBBBBFFAAHHCCCCHHCCCCBBHHAABBHHAAAACCCCFFAAFFBBFAA");
        // Conversion from AminoAcid values.
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ABCBBFAHCCHCCBHABHAACCFFBAF");
    }
}

// Checks the Murphy10 reduction: converting from characters and from
// AminoAcid values must produce the expected reduced-alphabet strings.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy10)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<Murphy10> >
        ReducedAminoAcidMurphy10;

    // Every letter in upper and lower case, followed by '*', '+' and '#'.
    CharString str = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    {
        // Conversion from plain characters.
        String<ReducedAminoAcidMurphy10> conv = str;
        SEQAN_ASSERT_EQ(
            CharString(conv),
            "AABBCCBBBBFFGGHHIIIIKKIIIIBBAAPPBBKKSSSSAAIIFFAAFFBBFAA");
        // Conversion from AminoAcid values.
        conv = aas;
        SEQAN_ASSERT_EQ(CharString(conv), "ABCBBFGHIIKIIBAPBKSSAIFFBAF");
    }
}

209 template <typename TModString>
_testReducedAminoAcidMurphy10ModIteratorsImpl(TModString & conv)210 void _testReducedAminoAcidMurphy10ModIteratorsImpl(TModString & conv)
211 {
212 typedef typename Iterator<TModString, Standard>::Type TIt;
213 typedef typename Iterator<TModString, Rooted>::Type TItR;
214
215 CharString toCharString = conv;
216 SEQAN_ASSERT_EQ(toCharString,
217 "ABCBBFGHIIKIIBAPBKSSAIFFBAF");
218
219 // iterating
220 {
221 unsigned c = 0;
222 for (TIt it = begin(conv, Standard()), itEnd = end(conv, Standard());
223 it != itEnd;
224 ++it, ++c)
225 SEQAN_ASSERT_EQ(char(*it), toCharString[c]);
226 }
227
228 // atBegin, atEnd, position (Standard)
229 {
230 TIt it = begin(conv, Standard());
231 SEQAN_ASSERT(atBegin(it, conv));
232 SEQAN_ASSERT_EQ(position(it, conv), 0u);
233
234 it = end(conv, Standard());
235 SEQAN_ASSERT(atEnd(it, conv));
236 SEQAN_ASSERT_EQ(position(it, conv), length(conv));
237 }
238
239 // atBegin, atEnd, position (Rooted)
240 {
241 TItR it = begin(conv, Rooted());
242 SEQAN_ASSERT(atBegin(it));
243 SEQAN_ASSERT(atBegin(it, conv));
244 SEQAN_ASSERT_EQ(position(it), 0u);
245 SEQAN_ASSERT_EQ(position(it, conv), 0u);
246
247 it = end(conv, Rooted());
248 SEQAN_ASSERT(atEnd(it));
249 SEQAN_ASSERT(atEnd(it, conv));
250 SEQAN_ASSERT_EQ(position(it), length(conv));
251 SEQAN_ASSERT_EQ(position(it, conv), length(conv));
252 }
253 }
254
// Runs the shared Murphy10 iterator checks on four flavours of the same
// reduced view: a mutable ModifiedString, a const ModifiedString, and a
// full-length infix of each.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy10_moditerators)
{
    typedef SimpleType<unsigned char, ReducedAminoAcid_<Murphy10> >
        ReducedAminoAcidMurphy10;
    typedef ModifiedString<String<AminoAcid>,
        ModView<FunctorConvert<AminoAcid, ReducedAminoAcidMurphy10>>> TModString;
    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    TModString conv(aas);
    _testReducedAminoAcidMurphy10ModIteratorsImpl(conv);

    TModString const conv2(aas);
    _testReducedAminoAcidMurphy10ModIteratorsImpl(conv2);

    // Infixes spanning the whole view must behave like the view itself.
    Segment<TModString, InfixSegment> convinf = infix(conv, 0, length(conv));
    _testReducedAminoAcidMurphy10ModIteratorsImpl(convinf);

    // Bound taken from conv2 itself (same length as conv; both wrap `aas`).
    Segment<TModString const, InfixSegment> conv2inf = infix(conv2, 0, length(conv2));
    _testReducedAminoAcidMurphy10ModIteratorsImpl(conv2inf);
}

276 struct ReducedFMIndexConfig_
277 {
278 typedef size_t LengthSum;
279 typedef WaveletTree<void, WTRDConfig<LengthSum, Alloc<>, 1> > Bwt;
280 typedef Levels<void, LevelsRDConfig<LengthSum, Alloc<>, 1> > Sentinels;
281
282 static const unsigned SAMPLING = 10;
283 };
284
// Builds an FM index over a StringSet of Murphy10-reduced views and searches
// it for the reduced form of "VVVVV" ("IIIII").
//
// The underlying amino acid strings are reversed before indexing, so every
// occurrence reported by the index is mapped back to forward coordinates
// before being recorded. All nine expected hits are then checked.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy10_modview_fmindex)
{
    typedef String<AminoAcid>                                           TOrigString;
    typedef StringSet<TOrigString, Owner<ConcatDirect<> > >             TOrigSet;

    typedef SimpleType<unsigned char, ReducedAminoAcid_<Murphy10> >     ReducedAminoAcidMurphy10;
    typedef ModView<FunctorConvert<AminoAcid, ReducedAminoAcidMurphy10> > TModView;
    typedef ModifiedString<TOrigString, TModView>                       TModString;
    typedef StringSet<TModString, Owner<ConcatDirect<> > >              TModSet;

    typedef FMIndex<void, ReducedFMIndexConfig_>                        TFMIndex;

    TOrigSet origSet;
    appendValue(origSet, "ABCDEFGHIJKLMNOPQRSTUVWYZX*");
    appendValue(origSet, "ABABABABABABILMVILMVILMVABABABABAB");
    appendValue(origSet, "ABCDEFGHIJKLMNOPQRSTUVWYZX*LLLLL");
    reverse(origSet); // FM-Index is reversed o_O

    // The reduced views show each sequence reversed and mapped to Murphy10.
    TModSet modSet(origSet);
    SEQAN_ASSERT_EQ(modSet[0], "FABFFIASSKBPABIIKIIHGFBBCBA");
    SEQAN_ASSERT_EQ(modSet[1], "BABABABABAIIIIIIIIIIIIBABABABABABA");
    SEQAN_ASSERT_EQ(modSet[2], "IIIIIFABFFIASSKBPABIIKIIHGFBBCBA");

    TOrigString query = "VVVVV";
    TModString modQuery(query);
    SEQAN_ASSERT_EQ(modQuery, "IIIII");

    Index<TModSet, TFMIndex> index(modSet);
    indexRequire(index, FibreSALF()); // instantiate

    // Exact search; the callback collects (sequence number, forward offset)
    // for every occurrence reported by the index iterator.
    typedef typename Iterator<Index<TModSet, TFMIndex>, TopDown<>>::Type TIndexIt;

    std::vector<std::pair<uint64_t, uint64_t>> hits;
    auto callback = [&] (TIndexIt & indexIt, int)
    {
        auto const & occurrences = getOccurrences(indexIt);
        for (auto it = begin(occurrences), itEnd = end(occurrences); it != itEnd; ++it)
        {
            auto subjOcc = *it;
            // reverse positions again: an offset in the reversed text becomes
            // (sequence length - offset - query length) in the forward text
            setSeqOffset(subjOcc,
                         length(origSet[getSeqNo(subjOcc)])
                         - getSeqOffset(subjOcc)
                         - length(query));
            hits.emplace_back(getSeqNo(subjOcc), getSeqOffset(subjOcc));
        }
    };

    Nothing nothing;
    _findImpl(nothing, index, modQuery, int(0), callback, Backtracking<Exact>());

    SEQAN_ASSERT_EQ(length(hits), 9u);
    SEQAN_ASSERT_EQ(std::get<0>(hits[0]), 1u); SEQAN_ASSERT_EQ(std::get<1>(hits[0]), 12u);
    SEQAN_ASSERT_EQ(std::get<0>(hits[1]), 2u); SEQAN_ASSERT_EQ(std::get<1>(hits[1]), 27u);
    // hits[2..8]: the remaining occurrences inside the 12-letter ILMV run of
    // sequence 1 (forward offsets 13..19). The bound is 8 so that the last
    // of the nine hits is verified as well.
    for (unsigned i = 1; i < 8; ++i)
    {
        SEQAN_ASSERT_EQ(std::get<0>(hits[1+i]), 1u); SEQAN_ASSERT_EQ(std::get<1>(hits[1+i]), 12u +i);
    }
}

346 #endif // SEQAN_TESTS_REDUCED_ALPHABET_H_
347