1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34
35 #ifndef TESTS_INDEX_TEST_QGRAM_INDEX_H
36 #define TESTS_INDEX_TEST_QGRAM_INDEX_H
37
38
39 //////////////////////////////////////////////////////////////////////////////
40
41 namespace seqan
42 {
43
SEQAN_DEFINE_TEST(testGappedShapes)44 SEQAN_DEFINE_TEST(testGappedShapes)
45 {
46 String<char> shape_string = "0010011011101";
47 Shape<Dna,GenericShape> shape1;
48 stringToShape(shape1, shape_string);
49 Shape<Dna,GenericShape> shape2 = Shape<Dna,GenericShape>(shape1);
50
51 SEQAN_ASSERT(shape1.weight == shape2.weight);
52 SEQAN_ASSERT(shape1.span == shape2.span);
53 SEQAN_ASSERT(shape1.diffs == shape2.diffs);
54 SEQAN_ASSERT(length(shape1) == length(shape2));
55 SEQAN_ASSERT(weight(shape1) == weight(shape2));
56 /*
57 Shape<Dna,GenericShape> shape3 = Shape<Dna,GenericShape>(5, 13);
58 shape3[0]=2;
59 shape3[1]=2;
60 shape3[2]=1;
61 shape3[3]=1;
62 shape3[4]=2;
63 shape3[5]=1;
64 shape3[6]=1;
65 shape3[7]=2;
66 for(int i = 0; i < 8; ++i)
67 SEQAN_ASSERT(shape1[i] == shape3[i]);
68 */
69 }
70
71
SEQAN_DEFINE_TEST(testUngappedShapes)72 SEQAN_DEFINE_TEST(testUngappedShapes)
73 {
74 Shape<Dna,SimpleShape> shape1;
75 resize(shape1, 4);
76 Shape<Dna,SimpleShape> shape2 = Shape<Dna,SimpleShape>(shape1);
77
78 SEQAN_ASSERT(shape1.span == shape2.span);
79 SEQAN_ASSERT(shape1.leftFactor == shape2.leftFactor);
80 SEQAN_ASSERT(length(shape1) == length(shape2));
81 SEQAN_ASSERT(weight(shape1) == weight(shape2));
82
83
84 Shape<Dna,SimpleShape> shape3 = Shape<Dna,SimpleShape>(4);
85 SEQAN_ASSERT(shape3.leftFactor == 64);
86 SEQAN_ASSERT(shape1.leftFactor == shape3.leftFactor);
87
88
89 }
90
91 template <typename TIndex>
testStepSize()92 void testStepSize()
93 {
94 TIndex index("CATGATTACATA");
95 setStepSize(index, 2);
96 hash(indexShape(index), "CAT");
97 String<typename Position<DnaString>::Type> occs;
98 occs = getOccurrences(index, indexShape(index));
99 SEQAN_ASSERT_EQ(length(occs), 2u);
100 SEQAN_ASSERT_EQ(occs[0], 0u);
101 SEQAN_ASSERT_EQ(occs[1], 8u);
102 }
103
SEQAN_DEFINE_TEST(testStepSize)104 SEQAN_DEFINE_TEST(testStepSize)
105 {
106 typedef Index<DnaString, IndexQGram< UngappedShape<3> > > TIndex1;
107 typedef Index<DnaString, IndexQGram< UngappedShape<3>, OpenAddressing > > TIndex2;
108
109 testStepSize<TIndex1>();
110 testStepSize<TIndex2>();
111 }
112
113 /*
114 void testQGramIndexSchnell()
115 {
116 clock_t start, finish;
117 double duration;
118
119 String<Dna> text;
120 fstream strm_t;
121 strm_t.open(TEST_PATH "fasta.txt", ios_base::in);
122 read(strm_t, text, Fasta());
123 String<Dna> next;
124 resize(next,length(text));
125 for(int i = 0; i < 1; ++i) // datei zu ende?
126 {
127 arrayCopyForward(begin(text),end(text),begin(next));
128 append(text, next);
129 }
130 strm_t.close();
131
132 String<char> shape_string = "1000100100001110011";
133 int q = length(shape_string);
134 Shape<Dna,GenericShape> shape;
135 stringToShape(shape, shape_string);
136
137 typedef Position<String<Dna> >::Type TPosition;
138 String<TPosition> pos;
139 resize(pos, length(text) - q + 2);
140
141 String<TPosition> dir;
142 int pos_size = _intPow((unsigned)ValueSize<Dna>::VALUE, weight(shape));
143 pos_size += 1;
144 resize(dir, pos_size);
145
146 start = clock();
147 Nothing nothing;
148 createQGramIndex(pos, dir, nothing, text, shape, 1);
149 finish = clock();
150 duration = (double)(finish - start) / CLOCKS_PER_SEC;
151 //std::cout << "\nQGramIndex bauen dauert: " << duration << " Sekunden.\n\n";
152
153
154 }
155 */
156 /*
157 void testGappedQGramIndex()
158 {
159 String<Dna> text = "CTGAACCCTAAACCCT";
160 String<char> shape_string = "101";
161 int q = length(shape_string);
162 Shape<Dna,GenericShape> shape;
163 stringToShape(shape, shape_string);
164
165 typedef Position<String<Dna> >::Type TPosition;
166 String<TPosition> pos;
167 resize(pos, length(text) - q + 2);
168
169 String<TPosition> dir;
170 int pos_size = _intPow((unsigned)ValueSize<Dna>::VALUE, weight(shape));
171 pos_size += 1;
172 resize(dir, pos_size);
173
174 Nothing nothing;
175 createQGramIndex(pos, dir, nothing, text, shape, 1);
176
177 SEQAN_ASSERT(dir[0] == 0);
178 SEQAN_ASSERT(dir[1] == 1);
179 SEQAN_ASSERT(dir[2] == 5);
180 SEQAN_ASSERT(dir[3] == 5);
181 SEQAN_ASSERT(dir[4] == 5);
182 SEQAN_ASSERT(dir[5] == 6);
183 SEQAN_ASSERT(dir[6] == 8);
184 SEQAN_ASSERT(dir[7] == 9);
185 SEQAN_ASSERT(dir[8] == 11);
186 SEQAN_ASSERT(dir[9] == 12);
187 SEQAN_ASSERT(dir[10] == 12);
188 SEQAN_ASSERT(dir[11] == 12);
189 SEQAN_ASSERT(dir[12] == 12);
190 SEQAN_ASSERT(dir[13] == 14);
191 SEQAN_ASSERT(dir[14] == 14);
192 SEQAN_ASSERT(dir[15] == 14);
193 SEQAN_ASSERT(dir[16] == 14);
194
195 SEQAN_ASSERT(pos[0] == 9);
196 SEQAN_ASSERT(pos[1] == 11);
197 SEQAN_ASSERT(pos[2] == 10);
198 SEQAN_ASSERT(pos[3] == 4);
199 SEQAN_ASSERT(pos[4] == 3);
200 SEQAN_ASSERT(pos[5] == 7);
201 SEQAN_ASSERT(pos[6] == 12);
202 SEQAN_ASSERT(pos[7] == 5);
203 SEQAN_ASSERT(pos[8] == 0);
204 SEQAN_ASSERT(pos[9] == 13);
205 SEQAN_ASSERT(pos[10] == 6);
206 SEQAN_ASSERT(pos[11] == 2);
207 SEQAN_ASSERT(pos[12] == 8);
208 SEQAN_ASSERT(pos[13] == 1);
209
210 }
211 */
SEQAN_DEFINE_TEST(testUngappedQGramIndex)212 SEQAN_DEFINE_TEST(testUngappedQGramIndex)
213 {
214 typedef String<Dna> TString;
215 typedef Shape<Dna, UngappedShape<2> > TShape;
216 typedef Index<TString, IndexQGram<TShape> > TIndex;
217 typedef Position<TString>::Type TPosition;
218
219 TString text("CTGAACCCTAAACCCT");
220
221 TIndex index(text);
222 indexCreate(index, QGramSADir());
223
224 String<TPosition> pos(getFibre(index, QGramSA()));
225 String<TPosition> dir(getFibre(index, QGramDir()));
226
227 SEQAN_ASSERT(dir[0] == 0);
228 SEQAN_ASSERT(dir[1] == 3);
229 SEQAN_ASSERT(dir[2] == 5);
230 SEQAN_ASSERT(dir[3] == 5);
231 SEQAN_ASSERT(dir[4] == 5);
232 SEQAN_ASSERT(dir[5] == 5);
233 SEQAN_ASSERT(dir[6] == 9);
234 SEQAN_ASSERT(dir[7] == 9);
235 SEQAN_ASSERT(dir[8] == 12);
236 SEQAN_ASSERT(dir[9] == 13);
237 SEQAN_ASSERT(dir[10] == 13);
238 SEQAN_ASSERT(dir[11] == 13);
239 SEQAN_ASSERT(dir[12] == 13);
240 SEQAN_ASSERT(dir[13] == 14);
241 SEQAN_ASSERT(dir[14] == 14);
242 SEQAN_ASSERT(dir[15] == 15);
243
244 SEQAN_ASSERT(pos[0] == 3);
245 SEQAN_ASSERT(pos[1] == 9);
246 SEQAN_ASSERT(pos[2] == 10);
247 SEQAN_ASSERT(pos[3] == 4);
248 SEQAN_ASSERT(pos[4] == 11);
249 SEQAN_ASSERT(pos[5] == 5);
250 SEQAN_ASSERT(pos[6] == 6);
251 SEQAN_ASSERT(pos[7] == 12);
252 SEQAN_ASSERT(pos[8] == 13);
253 SEQAN_ASSERT(pos[9] == 0);
254 SEQAN_ASSERT(pos[10] == 7);
255 SEQAN_ASSERT(pos[11] == 14);
256 SEQAN_ASSERT(pos[12] == 2);
257 SEQAN_ASSERT(pos[13] == 8);
258 SEQAN_ASSERT(pos[14] == 1);
259 }
260
261 inline bool
_qgramDisableBuckets(Index<StringSet<DnaString>,IndexQGram<Shape<Dna,UngappedShape<3>>>> & index)262 _qgramDisableBuckets(Index<StringSet<DnaString>, IndexQGram<Shape<Dna, UngappedShape<3> > > > &index)
263 {
264 indexDir(index)[1] = -1;
265 return true;
266 }
267
SEQAN_DEFINE_TEST(testUngappedQGramIndexMulti)268 SEQAN_DEFINE_TEST(testUngappedQGramIndexMulti)
269 {
270 typedef StringSet<DnaString> TStrings;
271 //typedef SAValue<TStrings>::Type TSAValue;
272 //typedef String<TSAValue> TPos;
273 typedef Shape<Dna, UngappedShape<3> > TShape;
274 typedef Index<TStrings, IndexQGram<TShape> > TIndex;
275
276 TStrings strings;
277 TIndex refIndex(strings);
278 TIndex testIndex(strings);
279
280 // 111111
281 // 0123456789012345
282 appendValue(strings, "CTGAACCCTAAACCCT");
283 // appendValue(strings, ""); // TODO: fix tupler to cope with strings smaller than q
284 appendValue(strings, "GAAGGAGTGTGTGT"); // requires to adapt pipe interface to return limitsString
285 // appendValue(strings, "GT");
286 appendValue(strings, "AAAACCCCAAACCCC");
287
288 TShape &shape = indexShape(refIndex);
289 indexCreate(refIndex, QGramSADir());
290 resize(indexSA(refIndex), lengthSum(strings) - length(strings) * (length(shape) - 1));
291 resize(indexDir(refIndex), _intPow((unsigned)ValueSize<Dna>::VALUE, length(shape)) + 1);
292 createQGramIndex(refIndex);
293
294 // test classic external index construction
295 resize(indexSA(testIndex), length(indexSA(refIndex)));
296 resize(indexDir(testIndex), length(indexDir(refIndex)));
297 createQGramIndexExt(testIndex);
298
299 for (unsigned i = 0; i < length(indexDir(refIndex)); ++i)
300 SEQAN_ASSERT_EQ_MSG(dirAt(i, refIndex), dirAt(i, testIndex), "i is %d", i);
301 for (unsigned i = 0; i < length(indexSA(refIndex)); ++i)
302 SEQAN_ASSERT_EQ_MSG(saAt(i, refIndex), saAt(i, testIndex), "i is %d", i);
303
304 clear(indexSA(testIndex));
305 clear(indexDir(testIndex));
306
307 // test new external index construction
308 resize(indexSA(testIndex), length(indexSA(refIndex)));
309 resize(indexDir(testIndex), length(indexDir(refIndex)));
310 createQGramIndexExtSA(testIndex);
311
312 for (unsigned i = 0; i < length(indexDir(refIndex)); ++i)
313 SEQAN_ASSERT_EQ_MSG(dirAt(i, refIndex), dirAt(i, testIndex), "i is %d", i);
314 for (unsigned i = 0; i < length(indexSA(refIndex)); ++i)
315 SEQAN_ASSERT_EQ_MSG(saAt(i, refIndex), saAt(i, testIndex), "i is %d", i);
316 }
317
318
319 //////////////////////////////////////////////////////////////////////////////
320
SEQAN_DEFINE_TEST(testQGramFind)321 SEQAN_DEFINE_TEST(testQGramFind)
322 {
323 typedef Index<String<char>, IndexQGram<UngappedShape<2> > > TQGramIndex;
324 TQGramIndex idx("to be or not to be");
325 Finder<TQGramIndex> finder(idx);
326
327 SEQAN_ASSERT(find(finder, "be"));
328 SEQAN_ASSERT(position(finder) == 3);
329 SEQAN_ASSERT(find(finder, "be"));
330 SEQAN_ASSERT(position(finder) == 16);
331 SEQAN_ASSERT(!find(finder, "be"));
332 /*
333 while (find(finder, "be"))
334 {
335 std::cout << position(finder) << "\n";
336 }
337 */
338 }
339
340 //////////////////////////////////////////////////////////////////////////////
341
342
343 } //namespace seqan
344
345 #endif //#ifndef SEQAN_HEADER_...
346