1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
33 // ==========================================================================
34 // Tests for reduced_aminoacid module
35 // ==========================================================================
36 
37 #ifndef SEQAN_TESTS_REDUCED_ALPHABET_H_
38 #define SEQAN_TESTS_REDUCED_ALPHABET_H_
39 
40 #include <seqan/basic.h>
41 #include <seqan/sequence.h>
42 #include <seqan/file.h>
43 
44 #include <seqan/reduced_aminoacid.h>
45 #include <seqan/modifier.h>
46 #include <seqan/index.h>
47 
48 using namespace seqan;
49 
#if 0
// NOTE(review): this test is excluded from compilation by the surrounding
// `#if 0`; the reason is not visible in this file -- presumably the
// ClusterReduction specialization is unavailable. Confirm before re-enabling.
//
// For three cluster-reduced alphabets (12, 10 and 8 letters) the test converts
// a character string and an AminoAcid string into the reduced alphabet and
// compares the text obtained by converting back to CharString.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_cluster_red)
{
    using TReduced8  = SimpleType<unsigned char, ReducedAminoAcid_<ClusterReduction<8> > >;
    using TReduced10 = SimpleType<unsigned char, ReducedAminoAcid_<ClusterReduction<10> > >;
    using TReduced12 = SimpleType<unsigned char, ReducedAminoAcid_<ClusterReduction<12> > >;

    // Upper- and lower-case letters followed by '*', '+' and '#'.
    CharString charInput = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    // Input that is already typed as AminoAcid.
    String<AminoAcid> aminoInput = "ARNDCQEGHILKMFPSTWYVBZX*";

    // N = 12
    {
        String<TReduced12> reduced = charInput;
        SEQAN_ASSERT_EQ(CharString(reduced),
                        "AANNCCNNRRFFGGHHIISSRRIIIINNSSPPRRRRSSSSSSIIWWSSFFRR*SS");

        reduced = aminoInput;
        SEQAN_ASSERT_EQ(CharString(reduced), "ARNNCRRGHIIRIFPSSWFINRS*");
    }

    // N = 10
    {
        String<TReduced10> reduced = charInput;
        SEQAN_ASSERT_EQ(CharString(reduced),
                        "AANNCCNNRRFFGGHHIIAARRIIIINNAAPPRRRRAAAAAAIIFFAAFFRR*AA");

        reduced = aminoInput;
        SEQAN_ASSERT_EQ(CharString(reduced), "ARNNCRRGHIIRIFPAAFFINRA*");
    }

    // N = 8
    {
        String<TReduced8> reduced = charInput;
        SEQAN_ASSERT_EQ(CharString(reduced),
                        "AARRCCRRRRFFGGRRIIAARRIIIIRRAAPPRRRRAAAAAAIIFFAAFFRR*AA");

        reduced = aminoInput;
        SEQAN_ASSERT_EQ(CharString(reduced), "ARRRCRRGRIIRIFPAAFFIRRA*");
    }
}
#endif
94 
// Checks the Buchfink11 reduction: both a plain character string and an
// AminoAcid string are converted into the reduced alphabet, and the text
// obtained by converting back to CharString is compared with the expectation.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_buchfink11)
{
    using TReduced = SimpleType<unsigned char, ReducedAminoAcid_<Buchfink11> >;

    // Upper- and lower-case letters followed by '*', '+' and '#'.
    CharString charInput = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    // Input that is already typed as AminoAcid.
    String<AminoAcid> aminoInput = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 11
    String<TReduced> reduced = charInput;
    SEQAN_ASSERT_EQ(CharString(reduced),
                    "AABBCCBBBBFFGGHHIIIIBBIIMMBBBBPPBBBBAAAACCIIWWAAYYBBFAA");

    // Re-use the same reduced string for the AminoAcid source.
    reduced = aminoInput;
    SEQAN_ASSERT_EQ(CharString(reduced), "ABCBBFGHIIBIMBBPBBAACIWYBAF");
}
113 
// Checks the Cannata10 reduction: both a plain character string and an
// AminoAcid string are converted into the reduced alphabet, and the text
// obtained by converting back to CharString is compared with the expectation.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_cannata10)
{
    using TReduced = SimpleType<unsigned char, ReducedAminoAcid_<Cannata10> >;

    // Upper- and lower-case letters followed by '*', '+' and '#'.
    CharString charInput = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    // Input that is already typed as AminoAcid.
    String<AminoAcid> aminoInput = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    String<TReduced> reduced = charInput;
    SEQAN_ASSERT_EQ(CharString(reduced),
                    "AABBCCBBEEFFAAHHIIIIKKIIIIBBKKPPEEKKAAAACCIIWWAAFFEEFAA");

    // Re-use the same reduced string for the AminoAcid source.
    reduced = aminoInput;
    SEQAN_ASSERT_EQ(CharString(reduced), "ABCBEFAHIIKIIBKPEKAACIWFEAF");
}
132 
// Checks the Li10 reduction: both a plain character string and an AminoAcid
// string are converted into the reduced alphabet, and the text obtained by
// converting back to CharString is compared with the expectation.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_li10)
{
    using TReduced = SimpleType<unsigned char, ReducedAminoAcid_<Li10> >;

    // Upper- and lower-case letters followed by '*', '+' and '#'.
    CharString charInput = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    // Input that is already typed as AminoAcid.
    String<AminoAcid> aminoInput = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    String<TReduced> reduced = charInput;
    SEQAN_ASSERT_EQ(CharString(reduced),
                    "AABBCCBBBBFFGGHHIIJJKKJJJJHHKKPPBBKKAAAACCIIFFAAFFBBFAA");

    // Re-use the same reduced string for the AminoAcid source.
    reduced = aminoInput;
    SEQAN_ASSERT_EQ(CharString(reduced), "ABCBBFGHIJKJJHKPBKAACIFFBAF");
}
151 
// Checks the Solis10 reduction: both a plain character string and an
// AminoAcid string are converted into the reduced alphabet, and the text
// obtained by converting back to CharString is compared with the expectation.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_solis10)
{
    using TReduced = SimpleType<unsigned char, ReducedAminoAcid_<Solis10> >;

    // Upper- and lower-case letters followed by '*', '+' and '#'.
    CharString charInput = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    // Input that is already typed as AminoAcid.
    String<AminoAcid> aminoInput = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    String<TReduced> reduced = charInput;
    SEQAN_ASSERT_EQ(CharString(reduced),
                    "AABBCCBBBBFFGGHHIIIIKKIIIIGGHHPPGGHHGGPPCCIIWWAAWWBBFAA");

    // Re-use the same reduced string for the AminoAcid source.
    reduced = aminoInput;
    SEQAN_ASSERT_EQ(CharString(reduced), "ABCBBFGHIIKIIGHPGHGPCIWWBAF");
}
170 
// Checks the Murphy5 reduction: both a plain character string and an
// AminoAcid string are converted into the reduced alphabet, and the text
// obtained by converting back to CharString is compared with the expectation.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy5)
{
    using TReduced = SimpleType<unsigned char, ReducedAminoAcid_<Murphy5> >;

    // Upper- and lower-case letters followed by '*', '+' and '#'.
    CharString charInput = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    // Input that is already typed as AminoAcid.
    String<AminoAcid> aminoInput = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 5
    String<TReduced> reduced = charInput;
    SEQAN_ASSERT_EQ(CharString(reduced),
                    "AABBCCBBBBFFAAHHCCCCHHCCCCBBHHAABBHHAAAACCCCFFAAFFBBFAA");

    // Re-use the same reduced string for the AminoAcid source.
    reduced = aminoInput;
    SEQAN_ASSERT_EQ(CharString(reduced), "ABCBBFAHCCHCCBHABHAACCFFBAF");
}
189 
// Checks the Murphy10 reduction: both a plain character string and an
// AminoAcid string are converted into the reduced alphabet, and the text
// obtained by converting back to CharString is compared with the expectation.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy10)
{
    using TReduced = SimpleType<unsigned char, ReducedAminoAcid_<Murphy10> >;

    // Upper- and lower-case letters followed by '*', '+' and '#'.
    CharString charInput = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz*+#";
    // Input that is already typed as AminoAcid.
    String<AminoAcid> aminoInput = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // N = 10
    String<TReduced> reduced = charInput;
    SEQAN_ASSERT_EQ(CharString(reduced),
                    "AABBCCBBBBFFGGHHIIIIKKIIIIBBAAPPBBKKSSSSAAIIFFAAFFBBFAA");

    // Re-use the same reduced string for the AminoAcid source.
    reduced = aminoInput;
    SEQAN_ASSERT_EQ(CharString(reduced), "ABCBBFGHIIKIIBAPBKSSAIFFBAF");
}
208 
209 template <typename TModString>
_testReducedAminoAcidMurphy10ModIteratorsImpl(TModString & conv)210 void _testReducedAminoAcidMurphy10ModIteratorsImpl(TModString & conv)
211 {
212     typedef typename Iterator<TModString, Standard>::Type TIt;
213     typedef typename Iterator<TModString, Rooted>::Type TItR;
214 
215     CharString toCharString = conv;
216     SEQAN_ASSERT_EQ(toCharString,
217                     "ABCBBFGHIIKIIBAPBKSSAIFFBAF");
218 
219     // iterating
220     {
221         unsigned c = 0;
222         for (TIt it = begin(conv, Standard()), itEnd = end(conv, Standard());
223              it != itEnd;
224              ++it, ++c)
225             SEQAN_ASSERT_EQ(char(*it), toCharString[c]);
226     }
227 
228     // atBegin, atEnd, position (Standard)
229     {
230         TIt it = begin(conv, Standard());
231         SEQAN_ASSERT(atBegin(it, conv));
232         SEQAN_ASSERT_EQ(position(it, conv), 0u);
233 
234         it = end(conv, Standard());
235         SEQAN_ASSERT(atEnd(it, conv));
236         SEQAN_ASSERT_EQ(position(it, conv), length(conv));
237     }
238 
239     // atBegin, atEnd, position (Rooted)
240     {
241         TItR it = begin(conv, Rooted());
242         SEQAN_ASSERT(atBegin(it));
243         SEQAN_ASSERT(atBegin(it, conv));
244         SEQAN_ASSERT_EQ(position(it), 0u);
245         SEQAN_ASSERT_EQ(position(it, conv), 0u);
246 
247         it = end(conv, Rooted());
248         SEQAN_ASSERT(atEnd(it));
249         SEQAN_ASSERT(atEnd(it, conv));
250         SEQAN_ASSERT_EQ(position(it), length(conv));
251         SEQAN_ASSERT_EQ(position(it, conv), length(conv));
252     }
253 }
254 
// Runs the shared iterator checks (_testReducedAminoAcidMurphy10ModIteratorsImpl)
// on four flavours of a Murphy10-reducing ModView over the same AminoAcid
// string: a mutable view, a const view, and a full-length infix of each.
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy10_moditerators)
{
    using TReduced   = SimpleType<unsigned char, ReducedAminoAcid_<Murphy10> >;
    using TModString = ModifiedString<String<AminoAcid>,
                                      ModView<FunctorConvert<AminoAcid, TReduced> > >;

    String<AminoAcid> aas = "ABCDEFGHIJKLMNOPQRSTUVWYZX*";

    // mutable view
    TModString conv(aas);
    _testReducedAminoAcidMurphy10ModIteratorsImpl(conv);

    // const view
    TModString const conv2(aas);
    _testReducedAminoAcidMurphy10ModIteratorsImpl(conv2);

    // infix spanning the whole mutable view
    Segment<TModString, InfixSegment> convinf = infix(conv, 0, length(conv));
    _testReducedAminoAcidMurphy10ModIteratorsImpl(convinf);

    // infix spanning the whole const view; the bound is taken from conv2
    // itself (not from conv) so the segment is self-consistent
    Segment<TModString const, InfixSegment> conv2inf = infix(conv2, 0, length(conv2));
    _testReducedAminoAcidMurphy10ModIteratorsImpl(conv2inf);
}
275 
276 struct ReducedFMIndexConfig_
277 {
278     typedef size_t                                                          LengthSum;
279     typedef WaveletTree<void, WTRDConfig<LengthSum, Alloc<>, 1> >           Bwt;
280     typedef Levels<void, LevelsRDConfig<LengthSum, Alloc<>, 1> >            Sentinels;
281 
282     static const unsigned SAMPLING = 10;
283 };
284 
SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy10_modview_fmindex)285 SEQAN_DEFINE_TEST(test_reduced_aminoacid_murphy10_modview_fmindex)
286 {
287     typedef String<AminoAcid>                                               TOrigString;
288     typedef StringSet<TOrigString, Owner<ConcatDirect<> > >                 TOrigSet;
289 
290     typedef SimpleType<unsigned char, ReducedAminoAcid_<Murphy10> >         ReducedAminoAcidMurphy10;
291     typedef ModView<FunctorConvert<AminoAcid, ReducedAminoAcidMurphy10> >   TModView;
292     typedef ModifiedString<TOrigString, TModView>                           TModString;
293     typedef StringSet<TModString, Owner<ConcatDirect<> > >                  TModSet;
294 
295     typedef FMIndex<void, ReducedFMIndexConfig_>                            TFMIndex;
296 
297     TOrigSet origSet;
298     appendValue(origSet, "ABCDEFGHIJKLMNOPQRSTUVWYZX*");
299     appendValue(origSet, "ABABABABABABILMVILMVILMVABABABABAB");
300     appendValue(origSet, "ABCDEFGHIJKLMNOPQRSTUVWYZX*LLLLL");
301     reverse(origSet); // FM-Index is reversed o_O
302 
303     TModSet modSet(origSet);
304     SEQAN_ASSERT_EQ(modSet[0], "FABFFIASSKBPABIIKIIHGFBBCBA");
305     SEQAN_ASSERT_EQ(modSet[1], "BABABABABAIIIIIIIIIIIIBABABABABABA");
306     SEQAN_ASSERT_EQ(modSet[2], "IIIIIFABFFIASSKBPABIIKIIHGFBBCBA");
307 
308     TOrigString query = "VVVVV";
309     TModString modQuery(query);
310     SEQAN_ASSERT_EQ(modQuery, "IIIII");
311 
312     Index<TModSet, TFMIndex> index(modSet);
313     indexRequire(index, FibreSALF());         // instantiate
314 
315     // actual search is only done if lambdas are available
316     typedef typename Iterator<Index<TModSet, TFMIndex>, TopDown<>>::Type TIndexIt;
317 
318     std::vector<std::pair<uint64_t, uint64_t>> hits;
319     auto callback = [&] (TIndexIt & indexIt, int)
320     {
321         auto const & occurrences = getOccurrences(indexIt);
322         for (auto it = begin(occurrences), itEnd = end(occurrences); it != itEnd; ++it)
323         {
324             auto subjOcc = *it;
325             // reverse positions again
326             setSeqOffset(subjOcc,
327                          length(origSet[getSeqNo(subjOcc)])
328                          - getSeqOffset(subjOcc)
329                          - length(query));
330             hits.emplace_back(getSeqNo(subjOcc), getSeqOffset(subjOcc));
331         }
332     };
333 
334     Nothing nothing;
335     _findImpl(nothing, index, modQuery, int(0), callback, Backtracking<Exact>());
336 
337     SEQAN_ASSERT_EQ(length(hits), 9u);
338     SEQAN_ASSERT_EQ(std::get<0>(hits[0]), 1u); SEQAN_ASSERT_EQ(std::get<1>(hits[0]), 12u);
339     SEQAN_ASSERT_EQ(std::get<0>(hits[1]), 2u); SEQAN_ASSERT_EQ(std::get<1>(hits[1]), 27u);
340     for (unsigned i = 1; i < 7; ++i)
341     {
342         SEQAN_ASSERT_EQ(std::get<0>(hits[1+i]), 1u); SEQAN_ASSERT_EQ(std::get<1>(hits[1+i]), 12u +i);
343     }
344 }
345 
346 #endif  // SEQAN_TESTS_REDUCED_ALPHABET_H_
347