1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /**
28 * Unit tests for search
29 */
30
31 #include <ktst/unit_test.hpp>
32 #include <kapp/main.h> /* KMain */
33
34 //#include <../libs/search/search-priv.h>
35 #include <search/nucstrstr.h>
36 #include <search/smith-waterman.h>
37
38 #include <stdexcept>
39 #include <limits>
40
41 #include <stdio.h>
42
43 #include "search-vdb.h"
44 #include <search/ref-variation.h>
45
46 #undef max
47
48 using namespace std;
49
50 static rc_t argsHandler(int argc, char* argv[]);
51 TEST_SUITE_WITH_ARGS_HANDLER(TestSuiteSearch, argsHandler);
52
argsHandler(int argc,char * argv[])53 static rc_t argsHandler(int argc, char* argv[]) {
54 Args* args = NULL;
55 rc_t rc = ArgsMakeAndHandle(&args, argc, argv, 0, NULL, 0);
56 ArgsWhack(args);
57 return rc;
58 }
59
trim_eol(char * psz)60 void trim_eol(char* psz)
61 {
62 size_t len = strlen(psz);
63 if (psz[len - 2] == '\n' || psz[len - 2] == '\r')
64 psz[len - 2] = '\0';
65 else if (psz[len - 1] == '\n' || psz[len - 1] == '\r')
66 psz[len - 1] = '\0';
67 }
68
69 #if 0
70 #include "PerfCounter.h"
71 #include <klib/checksum.h>
72
73 TEST_CASE(TempCRC)
74 {
75 std::cout << "Testing crc32 speed..." << std::endl;
76 size_t const size = (size_t)10 << 20;
77 char* buf = new char [size];
78 CPerfCounter counter( "CRC32" );
79
80 for ( size_t i = 0; i < size; ++i )
81 {
82 buf[i] = i;
83 }
84
85 size_t const ALIGN_BYTES = _ARCH_BITS / 8;
86
87 printf ("allocated %saligned (address=%p)\n", (int)((size_t)buf % ALIGN_BYTES) ? "un" : "", buf);
88
89 size_t offset = 2;
90 uint32_t crc32;
91
92 {
93 CPCount count(counter);
94 crc32 = ::CRC32 ( 0, buf + offset, size - offset );
95 }
96
97 printf ("Caclulated CRC32: 0x%08X (%.2lf MB/s)\n", crc32, (size >> 20)/counter.GetSeconds());
98
99 delete[]buf;
100 }
101 #endif
102
103 class AgrepFixture
104 {
105 public:
AgrepFixture()106 AgrepFixture()
107 : agrep_params ( 0 )
108 {
109 }
~AgrepFixture()110 ~AgrepFixture()
111 {
112 ::AgrepWhack ( agrep_params );
113 }
114
Setup(const char * pattern,AgrepFlags flags)115 rc_t Setup ( const char* pattern, AgrepFlags flags )
116 {
117 pattern_len = strlen ( pattern );
118 return ::AgrepMake ( & agrep_params, AGREP_MODE_ASCII | flags, pattern );
119 }
120
FindFirst(const string & text,int32_t threshold)121 bool FindFirst ( const string& text, int32_t threshold )
122 {
123 return ::AgrepFindFirst ( agrep_params, threshold, text . c_str (), text . size (), & match_info ) != 0;
124 }
125
FindBest(const string & text,int32_t threshold)126 bool FindBest ( const string& text, int32_t threshold )
127 {
128 return ::AgrepFindBest ( agrep_params, threshold, text . c_str (), text . size (), & match_info ) != 0;
129 }
130
131
132 public:
133 size_t pattern_len;
134 ::AgrepParams* agrep_params;
135 ::AgrepMatch match_info;
136 };
137
FIXTURE_TEST_CASE(AgrepDPTest,AgrepFixture)138 FIXTURE_TEST_CASE ( AgrepDPTest, AgrepFixture )
139 {
140 REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_DP ) );
141
142 // Complete match
143 {
144 REQUIRE ( FindFirst ( "MATCH", 0 ) );
145 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
146 REQUIRE_EQ ( 0, match_info.position );
147 REQUIRE_EQ ( 0, match_info.score );
148 }
149
150 // Complete substring match
151 {
152 REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
153 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
154 REQUIRE_EQ ( 2, match_info.position );
155 REQUIRE_EQ ( 0, match_info.score );
156 }
157 // 1 Deletion
158 {
159 REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
160 REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
161 REQUIRE_EQ ( 3, match_info.position );
162 REQUIRE_EQ ( 1, match_info.score );
163 }
164
165 // 2 Insertions
166 {
167 REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
168 REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
169 REQUIRE_EQ ( 3, match_info.position );
170 REQUIRE_EQ ( 2, match_info.score );
171 }
172
173 // 3 Mismatches
174 {
175 REQUIRE ( FindFirst ( "xATxx", 5 ) );
176 REQUIRE_EQ ( pattern_len /*2*/, (size_t)match_info.length ); // FIXME: 2 looks correct here
177 REQUIRE_EQ ( 0, match_info.position ); // 1
178 REQUIRE_EQ ( 3, match_info.score ); // 2
179 }
180
181 // Best match
182 {
183 REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
184 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
185 REQUIRE_EQ ( 18, match_info.position );
186 REQUIRE_EQ ( 0, match_info.score );
187 }
188 // First match
189 {
190 REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
191 REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
192 REQUIRE_EQ ( 0, match_info.position );
193 REQUIRE_EQ ( 1, match_info.score );
194 }
195
196 // Match anything
197 {
198 // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
199 // by implementation, the algorithm reports the result to be "found" at the tail portion of the reference string
200 // for this degenerate case, the expected behavior is not clear, so I'll just document the reality here:
201 const string text = "xyzvuwpiuuuu";
202 REQUIRE ( FindFirst ( "xyzvuwpiuuuu", pattern_len ) );
203 REQUIRE_EQ ( pattern_len + 1, (size_t)match_info.length );
204 REQUIRE_EQ ( text . size () - ( pattern_len + 1 ), (size_t)match_info.position );
205 REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
206 }
207
208 // Not found
209 {
210 REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
211 }
212 }
213
FIXTURE_TEST_CASE(AgrepWumanberTest,AgrepFixture)214 FIXTURE_TEST_CASE ( AgrepWumanberTest, AgrepFixture )
215 {
216 REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_WUMANBER ) );
217
218 // Complete match
219 {
220 REQUIRE ( FindFirst ( "MATCH", 0 ) );
221 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
222 REQUIRE_EQ ( 0, match_info.position );
223 REQUIRE_EQ ( 0, match_info.score );
224 }
225
226 // Complete substring match
227 {
228 REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
229 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
230 REQUIRE_EQ ( 2, match_info.position );
231 REQUIRE_EQ ( 0, match_info.score );
232 }
233 // 1 Deletion
234 {
235 REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
236 REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
237 REQUIRE_EQ ( 3, match_info.position );
238 REQUIRE_EQ ( 1, match_info.score );
239 }
240
241 // 2 Insertions
242 {
243 REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
244 REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
245 REQUIRE_EQ ( 3, match_info.position );
246 REQUIRE_EQ ( 2, match_info.score );
247 }
248
249 // 3 Mismatches
250 {
251 REQUIRE ( FindFirst ( "xATxx", 5 ) );
252 REQUIRE_EQ ( pattern_len /*2*/, (size_t)match_info.length ); // FIXME: 2 looks correct here
253 REQUIRE_EQ ( 0, match_info.position ); // 1
254 REQUIRE_EQ ( 3, match_info.score ); // 2
255 }
256
257 // Best match
258 {
259 REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
260 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
261 REQUIRE_EQ ( 18, match_info.position );
262 REQUIRE_EQ ( 0, match_info.score );
263 }
264 // First match
265 {
266 REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
267 REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
268 REQUIRE_EQ ( 0, match_info.position );
269 REQUIRE_EQ ( 1, match_info.score );
270 }
271
272 // Match anything
273 { // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
274 const string text = "xyzvuwpiuuuu";
275 REQUIRE ( FindFirst ( text, pattern_len ) );
276 REQUIRE_EQ ( text . size (), (size_t)match_info.length );
277 REQUIRE_EQ ( 0, match_info.position );
278 REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
279 }
280
281 // Not found
282 {
283 REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
284 }
285 }
286
FIXTURE_TEST_CASE(AgrepMyersTest,AgrepFixture)287 FIXTURE_TEST_CASE ( AgrepMyersTest, AgrepFixture )
288 {
289 REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_MYERS ) );
290
291 // Complete match
292 {
293 REQUIRE ( FindFirst ( "MATCH", 0 ) );
294 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
295 REQUIRE_EQ ( 0, match_info.position );
296 REQUIRE_EQ ( 0, match_info.score );
297 }
298
299 // Complete substring match
300 {
301 REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
302 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
303 REQUIRE_EQ ( 2, match_info.position );
304 REQUIRE_EQ ( 0, match_info.score );
305 }
306 // 1 Deletion
307 {
308 REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
309 REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
310 REQUIRE_EQ ( 3, match_info.position );
311 REQUIRE_EQ ( 1, match_info.score );
312 }
313
314 // 2 Insertions
315 {
316 REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
317 // REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
318 REQUIRE_EQ ( pattern_len, (size_t)match_info.length ); //FIXME: different from other algorithms
319 REQUIRE_EQ ( 3, match_info.position );
320 REQUIRE_EQ ( 2, match_info.score );
321 }
322
323 // 3 Mismatches
324 {
325 REQUIRE ( FindFirst ( "xATxx", 5 ) );
326 REQUIRE_EQ ( 2, match_info.length );
327 REQUIRE_EQ ( 1, match_info.position );
328 REQUIRE_EQ ( 3, match_info.score );
329 }
330
331 // Best match
332 {
333 REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
334 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
335 REQUIRE_EQ ( 18, match_info.position );
336 REQUIRE_EQ ( 0, match_info.score );
337 }
338 // First match
339 {
340 REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
341 REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
342 REQUIRE_EQ ( 0, match_info.position );
343 REQUIRE_EQ ( 1, match_info.score );
344 }
345
346 // Match anything
347 { // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
348 const string text = "xyzvuwpiuuuu";
349 REQUIRE ( FindFirst ( text, pattern_len ) );
350 // REQUIRE_EQ ( text . size (), (size_t)match_info.length );
351 REQUIRE_EQ ( 1, match_info.length ); //FIXME: different from other algorithms
352 REQUIRE_EQ ( 0, match_info.position );
353 REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
354 }
355
356 // Not found
357 {
358 REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
359 }
360 }
361
FIXTURE_TEST_CASE(AgrepMyersUnltdTest,AgrepFixture)362 FIXTURE_TEST_CASE ( AgrepMyersUnltdTest, AgrepFixture )
363 {
364 REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_MYERS_UNLTD ) );
365
366 // Complete match
367 {
368 REQUIRE ( FindFirst ( "MATCH", 0 ) );
369 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
370 REQUIRE_EQ ( 0, match_info.position );
371 REQUIRE_EQ ( 0, match_info.score );
372 }
373
374 // Complete substring match
375 {
376 REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
377 REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
378 REQUIRE_EQ ( 2, match_info.position );
379 REQUIRE_EQ ( 0, match_info.score );
380 }
381 // 1 Deletion
382 {
383 REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
384 REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
385 REQUIRE_EQ ( 3, match_info.position );
386 REQUIRE_EQ ( 1, match_info.score );
387 }
388
389 // 2 Insertions
390 {
391 REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
392 // REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
393 REQUIRE_EQ ( (size_t)pattern_len - 1, (size_t)match_info.length ); //FIXME: different from other algorithms
394 REQUIRE_EQ ( (int32_t)3, match_info.position );
395 REQUIRE_EQ ( (int32_t)2, match_info.score );
396 }
397
398 // 3 Mismatches
399 {
400 REQUIRE ( FindFirst ( "xATxx", 5 ) );
401 REQUIRE_EQ ( 2, match_info.length );
402 REQUIRE_EQ ( 1, match_info.position );
403 REQUIRE_EQ ( 3, match_info.score );
404 }
405
406 // Best match
407 {
408 REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
409 REQUIRE ( (size_t)match_info.length == pattern_len );
410 REQUIRE ( match_info.position == 18 );
411 REQUIRE ( match_info.score == 0 );
412 }
413 // First match
414 {
415 REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
416 REQUIRE ( (size_t)match_info.length == pattern_len - 1 );
417 REQUIRE ( match_info.position == 0 );
418 REQUIRE ( match_info.score == 1 );
419 }
420
421 // Match anything
422 { // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
423 const string text = "xyzvuwpiuuuu";
424 REQUIRE ( FindFirst ( text, pattern_len ) );
425 // REQUIRE_EQ ( text . size (), (size_t)match_info.length );
426 REQUIRE_EQ ( 1, match_info.length ); //FIXME: different from other algorithms
427 REQUIRE_EQ ( 0, match_info.position );
428 REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
429 }
430
431 // Not found
432 {
433 REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
434 }
435 }
436
437
TEST_CASE(SearchCompare)438 TEST_CASE(SearchCompare)
439 {
440 //std::cout << "This is search algorithm time comparison test" << std::endl << std::endl;
441 std::cout << "SearchCompare test is currently OFF" << std::endl;
442
443 /*
444 TimeTest("SRR067432", SearchTimeTest::ALGORITHM_AGREP);
445 TimeTest("SRR067432", SearchTimeTest::ALGORITHM_LCS_DP);
446 TimeTest("SRR067432", SearchTimeTest::ALGORITHM_NUCSTRSTR);
447 TimeTest("SRR067432", SearchTimeTest::ALGORITHM_LCS_SAIS);*/
448
449 //std::cout << std::endl << "Time comparison test acomplished" << std::endl;
450 }
451
TEST_CASE(TestCaseAgrep)452 TEST_CASE(TestCaseAgrep)
453 {
454 std::cout << "TestCaseAgrep test is currently OFF" << std::endl;
455 //VDBSearchTest::CFindLinker().Run(AGREP_MODE_ASCII|AGREP_ALG_MYERS, "SRR067432");//"SRR068408");
456 //VDBSearchTest::CFindLinker().Run(AGREP_MODE_ASCII|AGREP_ALG_MYERS, "SRR067408_extr");
457
458 /*
459 char const pszRunNamesFile[] = "454Runs.txt";
460 FILE* fRuns = fopen(pszRunNamesFile, "r");
461 if (!fRuns)
462 {
463 printf("Failed to open file: %s\n", pszRunNamesFile);
464 return;
465 }
466 bool bSkipping = true;
467 for (; !feof(fRuns); )
468 {
469 char szRuns[512] = "";
470 char* psz = fgets(szRuns, sizeof(szRuns), fRuns);
471 if (!psz)
472 break;
473 trim_eol(psz);
474 if (bSkipping && !strcmp(psz, "SRR067408"))
475 {
476 bSkipping = false;
477 }
478 if (bSkipping)
479 continue;
480 VDBSearchTest::CFindLinker().Run(AGREP_MODE_ASCII|AGREP_ALG_MYERS, psz);
481 }*/
482 }
483
484 // Fgrep
485
RunFgrep(FgrepFlags p_alg)486 static void RunFgrep ( FgrepFlags p_alg )
487 { // VDB-2669: creates uninitialized memory
488 Fgrep* fg;
489 const char* queries[] = { "RRRRRAH" };
490 if ( FgrepMake ( & fg, FGREP_MODE_ASCII | p_alg, queries, 1 ) ) // this used to leave uninitialized memory ...
491 throw logic_error ( "RunFgrep: FgrepMake() failed" );
492
493 const std::string str ( 1000, 'A' );
494 FgrepMatch matchinfo;
495 if ( 0 != FgrepFindFirst ( fg, str . data (), str . size (), & matchinfo ) ) // ... the use of which showed up in valgrind here, and sometimes caused a crash
496 throw logic_error ( "RunFgrep: FgrepFindFirst() found a false match" );
497
498 FgrepFree ( fg );
499 }
500
TEST_CASE(DumbGrep_Crash)501 TEST_CASE ( DumbGrep_Crash )
502 {
503 RunFgrep ( FGREP_ALG_DUMB );
504 }
505
TEST_CASE(BoyerMooreGrep_Crash)506 TEST_CASE ( BoyerMooreGrep_Crash )
507 {
508 RunFgrep ( FGREP_ALG_BOYERMOORE );
509 }
510
TEST_CASE(AhoGrep_Crash)511 TEST_CASE ( AhoGrep_Crash )
512 {
513 RunFgrep ( FGREP_ALG_AHOCORASICK );
514 }
515
516 // Smith-Waterman
517
518 class SmithWatermanFixture
519 {
520 public:
SmithWatermanFixture()521 SmithWatermanFixture()
522 : m_self ( 0 ),
523 m_pattern_len ( 0 )
524 {
525 }
~SmithWatermanFixture()526 ~SmithWatermanFixture()
527 {
528 ::SmithWatermanWhack ( m_self );
529 }
530
Setup(const string & p_query)531 rc_t Setup ( const string& p_query )
532 {
533 m_pattern_len = p_query . size ();
534 return ::SmithWatermanMake ( & m_self, p_query . data() );
535 }
536
FindFirst(const string & p_text,uint32_t p_threshold=numeric_limits<uint32_t>::max ())537 bool FindFirst ( const string& p_text, uint32_t p_threshold = numeric_limits<uint32_t>::max() )
538 {
539 return ::SmithWatermanFindFirst ( m_self, p_threshold, p_text . c_str (), p_text . size (), & m_match_info ) == 0;
540 }
541
542 public:
543 ::SmithWaterman* m_self;
544 size_t m_pattern_len;
545 ::SmithWatermanMatch m_match_info;
546 };
547
FIXTURE_TEST_CASE(SmithWatermanTest,SmithWatermanFixture)548 FIXTURE_TEST_CASE ( SmithWatermanTest, SmithWatermanFixture )
549 {
550 REQUIRE_RC ( Setup ( "MATCH" ) );
551
552 // VDB-3034: crash on query of length 0, when called right after initialization of the search object
553 {
554 REQUIRE(!FindFirst(""));
555 }
556
557 // threshold in FirstMatch varies from 0 (a complete mismatch is acceptable) to 2*m_pattern_len (perfect match)
558
559 // Complete match
560 {
561 REQUIRE ( FindFirst ( "MATCH" ) );
562 REQUIRE_EQ ( m_pattern_len, ( size_t ) m_match_info.length );
563 REQUIRE_EQ ( 0, m_match_info.position );
564 REQUIRE_EQ ( 2 * m_pattern_len, ( size_t ) m_match_info.score );
565 }
566 // Complete substring match
567 {
568 REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
569 REQUIRE_EQ ( m_pattern_len, (size_t)m_match_info.length );
570 REQUIRE_EQ ( 2, m_match_info.position );
571 REQUIRE_EQ ( 2 * m_pattern_len, ( size_t ) m_match_info.score );
572 }
573 // 1 Deletion
574 {
575 REQUIRE ( FindFirst ( "xxxMACHvv", 7 ) );
576 REQUIRE_EQ ( m_pattern_len - 1, (size_t)m_match_info.length );
577 REQUIRE_EQ ( 3, m_match_info.position );
578 REQUIRE_EQ ( 7, m_match_info.score );
579 }
580
581 // 2 Insertions
582 {
583 REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 8 ) );
584 REQUIRE_EQ ( m_pattern_len + 2, (size_t)m_match_info.length );
585 REQUIRE_EQ ( 3, m_match_info.position );
586 REQUIRE_EQ ( 8, m_match_info.score );
587 }
588
589 // 3 Mismatches
590 {
591 REQUIRE ( FindFirst ( "xATxx", 4 ) );
592 REQUIRE_EQ ( 2, m_match_info.length );
593 REQUIRE_EQ ( 1, m_match_info.position );
594 REQUIRE_EQ ( 4, m_match_info.score );
595 }
596
597 // First match
598 {
599 REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH" ) );
600 REQUIRE_EQ ( m_pattern_len, (size_t)m_match_info.length );
601 REQUIRE_EQ ( 18, m_match_info.position );
602 REQUIRE_EQ ( 2 * m_pattern_len, (size_t)m_match_info.score );
603 }
604
605 // Match anything
606 { // threshold is from 0 (a complete mismatch is acceptable) to 2*m_pattern_len (perfect match)
607 const string text = "xyzvuwpiuuuu";
608 REQUIRE ( FindFirst ( text, 0 ) );
609 REQUIRE_EQ ( 0, m_match_info.length );
610 REQUIRE_EQ ( 0, m_match_info.position );
611 REQUIRE_EQ ( 0, m_match_info.score );
612 }
613
614 // Not found
615 {
616 REQUIRE ( ! FindFirst ( "xyzvuwpiu", 1 ) );
617 }
618
619 }
620
621 // Ref-Variation
622 #if 0
623 static
624 void
625 PrintMatrix ( const INSDC_dna_text* p_ref, const INSDC_dna_text* p_query, const int p_matrix[], size_t p_rows, size_t p_cols )
626 {
627 cout << " ";
628 while ( *p_query )
629 {
630 cout << " " << *p_query;
631 ++p_query;
632 }
633 cout << endl;
634 for ( size_t i = 0; i < p_rows - 1; ++i ) // skip row 0 ( all 0s )
635 {
636 cout << i << " " << p_ref[i] << ": ";
637 for ( size_t j = 0; j < p_cols - 1; ++j ) // skip the 0 at the start
638 {
639 cout << p_matrix [ ( i + 1 ) * p_cols + j + 1 ] << " ";
640 }
641 cout << endl;
642 }
643 }
644 #endif
645
TEST_CASE(RefVariation_crash)646 TEST_CASE ( RefVariation_crash)
647 {
648 RefVariation* self;
649 INSDC_dna_text ref[] = "ACGTACGTACGTACGTACGTACGTACGTACGT";
650 REQUIRE_RC_FAIL ( RefVariationIUPACMake ( &self, ref, string_size ( ref ), 0, 0, "", 0, ::refvarAlgSW ) );
651 }
652
653 #ifdef _WIN32
654 #define PRSIZET "I"
655 #else
656 #define PRSIZET "z"
657 #endif
658
print_refvar_obj(::RefVariation const * obj)659 string print_refvar_obj (::RefVariation const* obj)
660 {
661 size_t allele_len = 0, allele_start = 0, allele_len_on_ref = 0;
662 char const* allele = NULL;
663 ::RefVariationGetAllele( obj, & allele, & allele_len, & allele_start );
664 ::RefVariationGetAlleleLenOnRef ( obj, & allele_len_on_ref );
665
666 //printf ("<no ref name>:%"PRSIZET"u:%"PRSIZET"u:%.*s\n", allele_start, allele_len_on_ref, (int)allele_len, allele);
667
668 return string ( allele, allele_len );
669 }
670
671 #undef PRSIZET
672
673
vrefvar_bounds(::RefVarAlg alg,char const * ref,size_t ref_len,size_t pos,size_t len_on_ref,char const * query,size_t query_len)674 string vrefvar_bounds (::RefVarAlg alg, char const* ref,
675 size_t ref_len, size_t pos, size_t len_on_ref,
676 char const* query, size_t query_len)
677 {
678
679 ::RefVariation* obj;
680
681 rc_t rc = ::RefVariationIUPACMake ( & obj, ref, ref_len, pos, len_on_ref, query, query_len, alg );
682
683 string ret = print_refvar_obj ( obj );
684
685 if ( rc == 0 )
686 ::RefVariationRelease( obj );
687
688 return ret;
689 }
690
vrefvar_bounds_n(::RefVarAlg alg)691 string vrefvar_bounds_n(::RefVarAlg alg)
692 {
693 // 01234567890123456789
694 char const ref[] = "NNNNNNNNNNTAACCCTAAC";
695 // CCCCTTAGG-
696
697 size_t pos = 5, len_on_ref = 10;
698 char const query[] = "CCCCTTAGG";
699
700 return vrefvar_bounds ( alg, ref, strlen(ref), pos, len_on_ref, query, strlen(query) );
701 }
702
vrefvar_bounds_0(::RefVarAlg alg)703 string vrefvar_bounds_0(::RefVarAlg alg)
704 {
705 // 01234567890123456789
706 char const ref[] = "TAACCCTAAC";
707 // TTAGG-
708
709 size_t pos = 0, len_on_ref = 5;
710 char const query[] = "TAGG";
711
712 return vrefvar_bounds ( alg, ref, strlen(ref), pos, len_on_ref, query, strlen(query) );
713 }
714
vrefvar_bounds_N0(::RefVarAlg alg)715 string vrefvar_bounds_N0(::RefVarAlg alg)
716 {
717 // 01234567890123456789
718 char const ref[] = "NNNNNTAACCCTAAC";
719 // CCCCTTTAGG-
720
721 size_t pos = 0, len_on_ref = 10;
722 char const query[] = "CCCCTTAGG";
723
724 return vrefvar_bounds ( alg, ref, strlen(ref), pos, len_on_ref, query, strlen(query) );
725 }
726
TEST_CASE(RefVariation_bounds_N)727 TEST_CASE ( RefVariation_bounds_N )
728 {
729 //printf ("TODO: this test is derived from the real example which hangs up now (2015-12-14):\n");
730 //printf ("echo \"67068302 NC_000001.10:9995:10:CCCCTTAGG\" | var-expand --algorithm=sw\n");
731
732 REQUIRE_EQ ( string ( "NNNNNCCCCTTAGGCTAA" ), vrefvar_bounds_n ( ::refvarAlgSW ) );
733 REQUIRE_EQ ( string ( "CCCCTTAGGC" ), vrefvar_bounds_n ( ::refvarAlgRA ) );
734
735 REQUIRE_EQ ( string ( "AGGC" ), vrefvar_bounds_0 ( ::refvarAlgSW ) );
736 REQUIRE_EQ ( string ( "TAGG" ), vrefvar_bounds_0 ( ::refvarAlgRA ) );
737
738 REQUIRE_EQ ( string ( "CCCCTTAGGCTAA" ), vrefvar_bounds_N0 ( ::refvarAlgSW ) );
739 REQUIRE_EQ ( string ( "CCCCTTAGGC" ), vrefvar_bounds_N0 ( ::refvarAlgRA ) );
740 }
741
742 // Nucstrstr
743 static
744 void
ConvertAsciiTo2NAPacked(const string & p_read,unsigned char * pBuf2NA,size_t nBuf2NASize)745 ConvertAsciiTo2NAPacked ( const string& p_read, unsigned char* pBuf2NA, size_t nBuf2NASize )
746 {
747 static unsigned char map [ 1 << ( sizeof ( char ) * 8 ) ];
748 map['A'] = map['a'] = 0;
749 map['C'] = map['c'] = 1;
750 map['G'] = map['g'] = 2;
751 map['T'] = map['t'] = 3;
752
753 static size_t shiftLeft [ 4 ] = { 6, 4, 2, 0 };
754
755 fill ( pBuf2NA, pBuf2NA + nBuf2NASize, 0 );
756
757 for ( size_t iChar = 0; iChar < p_read . size (); ++iChar )
758 {
759 size_t iByte = iChar / 4;
760 if ( iByte > nBuf2NASize )
761 {
762 assert ( false );
763 break;
764 }
765
766 pBuf2NA[iByte] |= map [ size_t ( p_read [ iChar ] ) ] << shiftLeft [ iChar % 4 ];
767 }
768 }
769
770 static
771 int
RunNucStrtr(const string & p_ref,const string & p_query,bool p_positional)772 RunNucStrtr ( const string& p_ref, const string& p_query, bool p_positional )
773 {
774 unsigned char buf2na [ 1024 ];
775 ConvertAsciiTo2NAPacked ( p_ref, buf2na, sizeof ( buf2na ) );
776
777 NucStrstr *nss;
778 if ( NucStrstrMake ( & nss, p_positional ? 1 : 0, p_query . c_str (), p_query . size () ) != 0 )
779 throw logic_error ( "RunNucStrtr: NucStrstrMake() failed" );
780 unsigned int selflen = 0u;
781 // NB: for now, all searches start are from the beginning ( nucstrstr.h says "may be >= 4"; not sure what that means )
782 const unsigned int pos = 0;
783 int ret = NucStrstrSearch ( nss, ( const void * ) buf2na, pos, p_ref . size () - pos, & selflen );
784 NucStrstrWhack ( nss );
785 return ret;
786 }
787
TEST_CASE(Nucstrstr_NonPositional_NotFound)788 TEST_CASE ( Nucstrstr_NonPositional_NotFound )
789 {
790 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACTA", false ) );
791 }
792
TEST_CASE(Nucstrstr_NonPositional_Found_AtStart)793 TEST_CASE ( Nucstrstr_NonPositional_Found_AtStart )
794 {
795 REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACGTACGTACG", false ) );
796 }
797
TEST_CASE(Nucstrstr_NonPositional_Found_InMiddle)798 TEST_CASE ( Nucstrstr_NonPositional_Found_InMiddle )
799 {
800 REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "GTACGTACG", false ) );
801 }
802
TEST_CASE(Nucstrstr_Positional_NoExpr_NotFound)803 TEST_CASE ( Nucstrstr_Positional_NoExpr_NotFound )
804 {
805 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACCGT", true ) );
806 }
807
TEST_CASE(Nucstrstr_Positional_NoExpr_Found_AtStart)808 TEST_CASE ( Nucstrstr_Positional_NoExpr_Found_AtStart )
809 {
810 REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACGTACGTACG", true ) );
811 }
812
TEST_CASE(Nucstrstr_Positional_NoExpr_Found_InMiddle)813 TEST_CASE ( Nucstrstr_Positional_NoExpr_Found_InMiddle )
814 {
815 REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "TACGTACG", true ) );
816 }
817
TEST_CASE(Nucstrstr_Positional_Not_False)818 TEST_CASE ( Nucstrstr_Positional_Not_False )
819 {
820 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "!TACGTACG", true ) );
821 }
TEST_CASE(Nucstrstr_Positional_Not_True)822 TEST_CASE ( Nucstrstr_Positional_Not_True )
823 { // "!" returns only 0 or 1, not the position
824 REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "!AG", true ) );
825 }
826
TEST_CASE(Nucstrstr_Positional_Caret_Found)827 TEST_CASE ( Nucstrstr_Positional_Caret_Found )
828 {
829 REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "^AC", true ) );
830 }
TEST_CASE(Nucstrstr_Positional_Caret_NotFound)831 TEST_CASE ( Nucstrstr_Positional_Caret_NotFound )
832 {
833 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "^C", true ) );
834 }
835
TEST_CASE(Nucstrstr_Positional_Dollar_Found)836 TEST_CASE ( Nucstrstr_Positional_Dollar_Found )
837 {
838 REQUIRE_EQ ( 33, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGCA$", true ) );
839 }
TEST_CASE(Nucstrstr_Positional_Dollar_NotFound)840 TEST_CASE ( Nucstrstr_Positional_Dollar_NotFound )
841 {
842 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCAA", "TGCA$", true ) );
843 }
844
TEST_CASE(Nucstrstr_Positional_OR_Found_1)845 TEST_CASE ( Nucstrstr_Positional_OR_Found_1 )
846 {
847 REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA|TACG", true ) );
848 }
TEST_CASE(Nucstrstr_Positional_OR_Found_2)849 TEST_CASE ( Nucstrstr_Positional_OR_Found_2 )
850 {
851 REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA||TACG", true ) );
852 }
TEST_CASE(Nucstrstr_Positional_OR_Found_FirstMatchPositionReported)853 TEST_CASE ( Nucstrstr_Positional_OR_Found_FirstMatchPositionReported )
854 {
855 REQUIRE_EQ ( 2, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA|CGT|TACG", true ) );
856 }
TEST_CASE(Nucstrstr_Positional_OR_NotFound)857 TEST_CASE ( Nucstrstr_Positional_OR_NotFound )
858 {
859 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA|TACA", true ) );
860 }
861
TEST_CASE(Nucstrstr_Positional_AND_Found_LastMatchPositionReported)862 TEST_CASE ( Nucstrstr_Positional_AND_Found_LastMatchPositionReported )
863 {
864 REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGT", "CGTA&TACG", true ) );
865 }
TEST_CASE(Nucstrstr_Positional_AND_NotFound)866 TEST_CASE ( Nucstrstr_Positional_AND_NotFound )
867 {
868 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGT", "TACG&TACA", true ) );
869 }
870
TEST_CASE(Nucstrstr_Positional_AND_OR_Found_1)871 TEST_CASE ( Nucstrstr_Positional_AND_OR_Found_1 )
872 {
873 REQUIRE_EQ ( 3, RunNucStrtr ( "ACGTACGT", "CGTA&TACC|GTAC", true ) );
874 }
TEST_CASE(Nucstrstr_Positional_AND_OR_Found_2)875 TEST_CASE ( Nucstrstr_Positional_AND_OR_Found_2 )
876 {
877 REQUIRE_EQ ( 3, RunNucStrtr ( "ACGTACGT", "CGTA&(TACC|GTAC)", true ) );
878 }
TEST_CASE(Nucstrstr_Positional_AND_OR_Found_3)879 TEST_CASE ( Nucstrstr_Positional_AND_OR_Found_3 )
880 {
881 REQUIRE_EQ ( 2, RunNucStrtr ( "ACGTACGT", "(TACC|GTAC)&&CGTA", true ) );
882 }
883
TEST_CASE(Nucstrstr_Positional_AND_OR_NOT_NotFound)884 TEST_CASE ( Nucstrstr_Positional_AND_OR_NOT_NotFound )
885 {
886 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGT", "(TACC|GTAC)&&!CGTA", true ) );
887 }
TEST_CASE(Nucstrstr_Positional_AND_OR_NOT_Found)888 TEST_CASE ( Nucstrstr_Positional_AND_OR_NOT_Found )
889 {
890 REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGT", "(TACC|GTAC)&&!CATA", true ) );
891 }
892
TEST_CASE(Nucstrstr_Positional_Error)893 TEST_CASE ( Nucstrstr_Positional_Error )
894 {
895 REQUIRE_THROW ( RunNucStrtr ( "ACGTACGT", "(TACC", true ) );
896 }
897
TEST_CASE(Nucstrstr_NonPositional_4NA_NotFound)898 TEST_CASE ( Nucstrstr_NonPositional_4NA_NotFound )
899 {
900 REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACTN", false ) );
901 }
902
TEST_CASE(Nucstrstr_NonPositional_4NA_Found_AtStart)903 TEST_CASE ( Nucstrstr_NonPositional_4NA_Found_AtStart )
904 {
905 REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACGTACGTACN", false ) );
906 }
907
TEST_CASE(Nucstrstr_NonPositional_4NA_Found_InMiddle)908 TEST_CASE ( Nucstrstr_NonPositional_4NA_Found_InMiddle )
909 {
910 REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "GTACGTACN", false ) );
911 }
912
913
914
915 //////////////////////////////////////////// Main
916 extern "C"
917 {
918
919 #include <kapp/args.h>
920 #include <kfg/config.h>
921
KAppVersion(void)922 ver_t CC KAppVersion ( void )
923 {
924 return 0x1000000;
925 }
UsageSummary(const char * progname)926 rc_t CC UsageSummary (const char * progname)
927 {
928 return 0;
929 }
930
Usage(const Args * args)931 rc_t CC Usage ( const Args * args )
932 {
933 return 0;
934 }
935
936 const char UsageDefaultName[] = "test-search";
937
KMain(int argc,char * argv[])938 rc_t CC KMain ( int argc, char *argv [] )
939 {
940 KConfigDisableUserSettings();
941 rc_t rc = TestSuiteSearch(argc, argv);
942 return rc;
943 }
944
945 } // end of extern "C"
946