1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef TESTS_INDEX_TEST_QGRAM_INDEX_H
36 #define TESTS_INDEX_TEST_QGRAM_INDEX_H
37 
38 
39 //////////////////////////////////////////////////////////////////////////////
40 
41 namespace seqan
42 {
43 
SEQAN_DEFINE_TEST(testGappedShapes)44 SEQAN_DEFINE_TEST(testGappedShapes)
45 {
46     String<char> shape_string = "0010011011101";
47     Shape<Dna,GenericShape> shape1;
48     stringToShape(shape1, shape_string);
49     Shape<Dna,GenericShape> shape2 = Shape<Dna,GenericShape>(shape1);
50 
51     SEQAN_ASSERT(shape1.weight == shape2.weight);
52     SEQAN_ASSERT(shape1.span == shape2.span);
53     SEQAN_ASSERT(shape1.diffs == shape2.diffs);
54     SEQAN_ASSERT(length(shape1) == length(shape2));
55     SEQAN_ASSERT(weight(shape1) == weight(shape2));
56 /*
57     Shape<Dna,GenericShape> shape3 = Shape<Dna,GenericShape>(5, 13);
58     shape3[0]=2;
59     shape3[1]=2;
60     shape3[2]=1;
61     shape3[3]=1;
62     shape3[4]=2;
63     shape3[5]=1;
64     shape3[6]=1;
65     shape3[7]=2;
66     for(int i = 0; i < 8; ++i)
67         SEQAN_ASSERT(shape1[i] == shape3[i]);
68 */
69 }
70 
71 
SEQAN_DEFINE_TEST(testUngappedShapes)72 SEQAN_DEFINE_TEST(testUngappedShapes)
73 {
74     Shape<Dna,SimpleShape> shape1;
75     resize(shape1, 4);
76     Shape<Dna,SimpleShape> shape2 = Shape<Dna,SimpleShape>(shape1);
77 
78     SEQAN_ASSERT(shape1.span == shape2.span);
79     SEQAN_ASSERT(shape1.leftFactor == shape2.leftFactor);
80     SEQAN_ASSERT(length(shape1) == length(shape2));
81     SEQAN_ASSERT(weight(shape1) == weight(shape2));
82 
83 
84     Shape<Dna,SimpleShape> shape3 = Shape<Dna,SimpleShape>(4);
85     SEQAN_ASSERT(shape3.leftFactor == 64);
86     SEQAN_ASSERT(shape1.leftFactor == shape3.leftFactor);
87 
88 
89 }
90 
91 template <typename TIndex>
testStepSize()92 void testStepSize()
93 {
94     TIndex index("CATGATTACATA");
95     setStepSize(index, 2);
96     hash(indexShape(index), "CAT");
97     String<typename Position<DnaString>::Type> occs;
98     occs = getOccurrences(index, indexShape(index));
99     SEQAN_ASSERT_EQ(length(occs), 2u);
100     SEQAN_ASSERT_EQ(occs[0], 0u);
101     SEQAN_ASSERT_EQ(occs[1], 8u);
102 }
103 
SEQAN_DEFINE_TEST(testStepSize)104 SEQAN_DEFINE_TEST(testStepSize)
105 {
106     typedef Index<DnaString, IndexQGram< UngappedShape<3> > > TIndex1;
107     typedef Index<DnaString, IndexQGram< UngappedShape<3>, OpenAddressing > > TIndex2;
108 
109     testStepSize<TIndex1>();
110     testStepSize<TIndex2>();
111 }
112 
113 /*
114 void testQGramIndexSchnell()
115 {
116     clock_t start, finish;
117     double duration;
118 
119     String<Dna> text;
120     fstream strm_t;
121     strm_t.open(TEST_PATH "fasta.txt", ios_base::in);
122     read(strm_t, text, Fasta());
123     String<Dna> next;
124     resize(next,length(text));
125     for(int i = 0; i < 1; ++i)                            // datei zu ende?
126     {
127         arrayCopyForward(begin(text),end(text),begin(next));
128         append(text, next);
129     }
130     strm_t.close();
131 
132     String<char> shape_string = "1000100100001110011";
133     int q = length(shape_string);
134     Shape<Dna,GenericShape> shape;
135     stringToShape(shape, shape_string);
136 
137     typedef Position<String<Dna> >::Type TPosition;
138     String<TPosition> pos;
139     resize(pos, length(text) - q + 2);
140 
141     String<TPosition> dir;
142     int pos_size = _intPow((unsigned)ValueSize<Dna>::VALUE, weight(shape));
143     pos_size += 1;
144     resize(dir, pos_size);
145 
146     start = clock();
147     Nothing nothing;
148     createQGramIndex(pos, dir, nothing, text, shape, 1);
149     finish = clock();
150     duration = (double)(finish - start) / CLOCKS_PER_SEC;
151     //std::cout << "\nQGramIndex bauen dauert: " << duration << " Sekunden.\n\n";
152 
153 
154 }
155 */
156 /*
157 void testGappedQGramIndex()
158 {
159     String<Dna> text = "CTGAACCCTAAACCCT";
160     String<char> shape_string = "101";
161     int q = length(shape_string);
162     Shape<Dna,GenericShape> shape;
163     stringToShape(shape, shape_string);
164 
165     typedef Position<String<Dna> >::Type TPosition;
166     String<TPosition> pos;
167     resize(pos, length(text) - q + 2);
168 
169     String<TPosition> dir;
170     int pos_size = _intPow((unsigned)ValueSize<Dna>::VALUE, weight(shape));
171     pos_size += 1;
172     resize(dir, pos_size);
173 
174     Nothing nothing;
175     createQGramIndex(pos, dir, nothing, text, shape, 1);
176 
177     SEQAN_ASSERT(dir[0] == 0);
178     SEQAN_ASSERT(dir[1] == 1);
179     SEQAN_ASSERT(dir[2] == 5);
180     SEQAN_ASSERT(dir[3] == 5);
181     SEQAN_ASSERT(dir[4] == 5);
182     SEQAN_ASSERT(dir[5] == 6);
183     SEQAN_ASSERT(dir[6] == 8);
184     SEQAN_ASSERT(dir[7] == 9);
185     SEQAN_ASSERT(dir[8] == 11);
186     SEQAN_ASSERT(dir[9] == 12);
187     SEQAN_ASSERT(dir[10] == 12);
188     SEQAN_ASSERT(dir[11] == 12);
189     SEQAN_ASSERT(dir[12] == 12);
190     SEQAN_ASSERT(dir[13] == 14);
191     SEQAN_ASSERT(dir[14] == 14);
192     SEQAN_ASSERT(dir[15] == 14);
193     SEQAN_ASSERT(dir[16] == 14);
194 
195     SEQAN_ASSERT(pos[0] == 9);
196     SEQAN_ASSERT(pos[1] == 11);
197     SEQAN_ASSERT(pos[2] == 10);
198     SEQAN_ASSERT(pos[3] == 4);
199     SEQAN_ASSERT(pos[4] == 3);
200     SEQAN_ASSERT(pos[5] == 7);
201     SEQAN_ASSERT(pos[6] == 12);
202     SEQAN_ASSERT(pos[7] == 5);
203     SEQAN_ASSERT(pos[8] == 0);
204     SEQAN_ASSERT(pos[9] == 13);
205     SEQAN_ASSERT(pos[10] == 6);
206     SEQAN_ASSERT(pos[11] == 2);
207     SEQAN_ASSERT(pos[12] == 8);
208     SEQAN_ASSERT(pos[13] == 1);
209 
210 }
211 */
SEQAN_DEFINE_TEST(testUngappedQGramIndex)212 SEQAN_DEFINE_TEST(testUngappedQGramIndex)
213 {
214     typedef String<Dna> TString;
215     typedef Shape<Dna, UngappedShape<2> > TShape;
216     typedef Index<TString, IndexQGram<TShape> > TIndex;
217     typedef Position<TString>::Type TPosition;
218 
219     TString text("CTGAACCCTAAACCCT");
220 
221     TIndex index(text);
222     indexCreate(index, QGramSADir());
223 
224     String<TPosition> pos(getFibre(index, QGramSA()));
225     String<TPosition> dir(getFibre(index, QGramDir()));
226 
227     SEQAN_ASSERT(dir[0] == 0);
228     SEQAN_ASSERT(dir[1] == 3);
229     SEQAN_ASSERT(dir[2] == 5);
230     SEQAN_ASSERT(dir[3] == 5);
231     SEQAN_ASSERT(dir[4] == 5);
232     SEQAN_ASSERT(dir[5] == 5);
233     SEQAN_ASSERT(dir[6] == 9);
234     SEQAN_ASSERT(dir[7] == 9);
235     SEQAN_ASSERT(dir[8] == 12);
236     SEQAN_ASSERT(dir[9] == 13);
237     SEQAN_ASSERT(dir[10] == 13);
238     SEQAN_ASSERT(dir[11] == 13);
239     SEQAN_ASSERT(dir[12] == 13);
240     SEQAN_ASSERT(dir[13] == 14);
241     SEQAN_ASSERT(dir[14] == 14);
242     SEQAN_ASSERT(dir[15] == 15);
243 
244     SEQAN_ASSERT(pos[0] == 3);
245     SEQAN_ASSERT(pos[1] == 9);
246     SEQAN_ASSERT(pos[2] == 10);
247     SEQAN_ASSERT(pos[3] == 4);
248     SEQAN_ASSERT(pos[4] == 11);
249     SEQAN_ASSERT(pos[5] == 5);
250     SEQAN_ASSERT(pos[6] == 6);
251     SEQAN_ASSERT(pos[7] == 12);
252     SEQAN_ASSERT(pos[8] == 13);
253     SEQAN_ASSERT(pos[9] == 0);
254     SEQAN_ASSERT(pos[10] == 7);
255     SEQAN_ASSERT(pos[11] == 14);
256     SEQAN_ASSERT(pos[12] == 2);
257     SEQAN_ASSERT(pos[13] == 8);
258     SEQAN_ASSERT(pos[14] == 1);
259 }
260 
261 inline bool
_qgramDisableBuckets(Index<StringSet<DnaString>,IndexQGram<Shape<Dna,UngappedShape<3>>>> & index)262 _qgramDisableBuckets(Index<StringSet<DnaString>, IndexQGram<Shape<Dna, UngappedShape<3> > > > &index)
263 {
264     indexDir(index)[1] = -1;
265     return true;
266 }
267 
SEQAN_DEFINE_TEST(testUngappedQGramIndexMulti)268 SEQAN_DEFINE_TEST(testUngappedQGramIndexMulti)
269 {
270     typedef StringSet<DnaString>                    TStrings;
271     //typedef SAValue<TStrings>::Type                 TSAValue;
272     //typedef String<TSAValue>                        TPos;
273     typedef Shape<Dna, UngappedShape<3> >           TShape;
274     typedef Index<TStrings, IndexQGram<TShape> >    TIndex;
275 
276     TStrings strings;
277     TIndex refIndex(strings);
278     TIndex testIndex(strings);
279 
280                        //           111111
281                        // 0123456789012345
282     appendValue(strings, "CTGAACCCTAAACCCT");
283 //    appendValue(strings, "");                   // TODO: fix tupler to cope with strings smaller than q
284     appendValue(strings, "GAAGGAGTGTGTGT");     //       requires to adapt pipe interface to return limitsString
285 //    appendValue(strings, "GT");
286     appendValue(strings, "AAAACCCCAAACCCC");
287 
288     TShape &shape = indexShape(refIndex);
289     indexCreate(refIndex, QGramSADir());
290     resize(indexSA(refIndex), lengthSum(strings) - length(strings) * (length(shape) - 1));
291     resize(indexDir(refIndex), _intPow((unsigned)ValueSize<Dna>::VALUE, length(shape)) + 1);
292     createQGramIndex(refIndex);
293 
294     // test classic external index construction
295     resize(indexSA(testIndex), length(indexSA(refIndex)));
296     resize(indexDir(testIndex), length(indexDir(refIndex)));
297     createQGramIndexExt(testIndex);
298 
299     for (unsigned i = 0; i < length(indexDir(refIndex)); ++i)
300         SEQAN_ASSERT_EQ_MSG(dirAt(i, refIndex), dirAt(i, testIndex), "i is %d", i);
301     for (unsigned i = 0; i < length(indexSA(refIndex)); ++i)
302         SEQAN_ASSERT_EQ_MSG(saAt(i, refIndex), saAt(i, testIndex), "i is %d", i);
303 
304     clear(indexSA(testIndex));
305     clear(indexDir(testIndex));
306 
307     // test new external index construction
308     resize(indexSA(testIndex), length(indexSA(refIndex)));
309     resize(indexDir(testIndex), length(indexDir(refIndex)));
310     createQGramIndexExtSA(testIndex);
311 
312     for (unsigned i = 0; i < length(indexDir(refIndex)); ++i)
313         SEQAN_ASSERT_EQ_MSG(dirAt(i, refIndex), dirAt(i, testIndex), "i is %d", i);
314     for (unsigned i = 0; i < length(indexSA(refIndex)); ++i)
315         SEQAN_ASSERT_EQ_MSG(saAt(i, refIndex), saAt(i, testIndex), "i is %d", i);
316 }
317 
318 
319 //////////////////////////////////////////////////////////////////////////////
320 
SEQAN_DEFINE_TEST(testQGramFind)321 SEQAN_DEFINE_TEST(testQGramFind)
322 {
323     typedef Index<String<char>, IndexQGram<UngappedShape<2> > > TQGramIndex;
324     TQGramIndex idx("to be or not to be");
325     Finder<TQGramIndex> finder(idx);
326 
327     SEQAN_ASSERT(find(finder, "be"));
328     SEQAN_ASSERT(position(finder) == 3);
329     SEQAN_ASSERT(find(finder, "be"));
330     SEQAN_ASSERT(position(finder) == 16);
331     SEQAN_ASSERT(!find(finder, "be"));
332 /*
333     while (find(finder, "be"))
334     {
335         std::cout << position(finder) << "\n";
336     }
337 */
338 }
339 
340 //////////////////////////////////////////////////////////////////////////////
341 
342 
343 } //namespace seqan
344 
345 #endif //#ifndef SEQAN_HEADER_...
346