1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 /**
28 * Unit tests for search
29 */
30 
31 #include <ktst/unit_test.hpp>
32 #include <kapp/main.h> /* KMain */
33 
34 //#include <../libs/search/search-priv.h>
35 #include <search/nucstrstr.h>
36 #include <search/smith-waterman.h>
37 
38 #include <stdexcept>
39 #include <limits>
40 
41 #include <stdio.h>
42 
43 #include "search-vdb.h"
44 #include <search/ref-variation.h>
45 
46 #undef max
47 
48 using namespace std;
49 
50 static rc_t argsHandler(int argc, char* argv[]);
51 TEST_SUITE_WITH_ARGS_HANDLER(TestSuiteSearch, argsHandler);
52 
argsHandler(int argc,char * argv[])53 static rc_t argsHandler(int argc, char* argv[]) {
54     Args* args = NULL;
55     rc_t rc = ArgsMakeAndHandle(&args, argc, argv, 0, NULL, 0);
56     ArgsWhack(args);
57     return rc;
58 }
59 
trim_eol(char * psz)60 void trim_eol(char* psz)
61 {
62     size_t len = strlen(psz);
63     if (psz[len - 2] == '\n' || psz[len - 2] == '\r')
64         psz[len - 2] = '\0';
65     else if (psz[len - 1] == '\n' || psz[len - 1] == '\r')
66         psz[len - 1] = '\0';
67 }
68 
69 #if 0
70 #include "PerfCounter.h"
71 #include <klib/checksum.h>
72 
73 TEST_CASE(TempCRC)
74 {
75     std::cout << "Testing crc32 speed..." << std::endl;
76     size_t const size = (size_t)10 << 20;
77     char* buf = new char [size];
78     CPerfCounter counter( "CRC32" );
79 
80     for ( size_t i = 0; i < size; ++i )
81     {
82         buf[i] = i;
83     }
84 
85     size_t const ALIGN_BYTES = _ARCH_BITS / 8;
86 
87     printf ("allocated %saligned (address=%p)\n", (int)((size_t)buf % ALIGN_BYTES) ? "un" : "", buf);
88 
89     size_t offset = 2;
90     uint32_t crc32;
91 
92     {
93         CPCount count(counter);
94         crc32 = ::CRC32 ( 0, buf + offset, size - offset );
95     }
96 
97     printf ("Caclulated CRC32: 0x%08X (%.2lf MB/s)\n", crc32, (size >> 20)/counter.GetSeconds());
98 
99     delete[]buf;
100 }
101 #endif
102 
103 class AgrepFixture
104 {
105 public:
AgrepFixture()106 	AgrepFixture()
107 	: agrep_params ( 0 )
108 	{
109 	}
~AgrepFixture()110 	~AgrepFixture()
111 	{
112 		::AgrepWhack ( agrep_params );
113 	}
114 
Setup(const char * pattern,AgrepFlags flags)115 	rc_t Setup ( const char* pattern, AgrepFlags flags )
116 	{
117 		pattern_len = strlen ( pattern );
118 		return ::AgrepMake ( & agrep_params, AGREP_MODE_ASCII | flags, pattern );
119 	}
120 
FindFirst(const string & text,int32_t threshold)121 	bool FindFirst ( const string& text, int32_t threshold )
122 	{
123 		return ::AgrepFindFirst ( agrep_params, threshold, text . c_str (), text . size (), & match_info ) != 0;
124 	}
125 
FindBest(const string & text,int32_t threshold)126 	bool FindBest ( const string& text, int32_t threshold )
127 	{
128 		return ::AgrepFindBest ( agrep_params, threshold, text . c_str (), text . size (), & match_info ) != 0;
129 	}
130 
131 
132 public:
133 	size_t pattern_len;
134 	::AgrepParams* agrep_params;
135     ::AgrepMatch match_info;
136 };
137 
FIXTURE_TEST_CASE(AgrepDPTest,AgrepFixture)138 FIXTURE_TEST_CASE ( AgrepDPTest, AgrepFixture )
139 {
140     REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_DP ) );
141 
142     // Complete match
143     {
144         REQUIRE ( FindFirst ( "MATCH", 0 ) );
145         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
146         REQUIRE_EQ ( 0, match_info.position );
147         REQUIRE_EQ ( 0, match_info.score );
148     }
149 
150     // Complete substring match
151     {
152         REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
153         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
154         REQUIRE_EQ ( 2, match_info.position );
155         REQUIRE_EQ ( 0, match_info.score );
156     }
157     // 1 Deletion
158     {
159         REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
160         REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
161         REQUIRE_EQ ( 3, match_info.position );
162         REQUIRE_EQ ( 1, match_info.score );
163     }
164 
165     // 2 Insertions
166     {
167         REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
168         REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
169         REQUIRE_EQ ( 3, match_info.position );
170         REQUIRE_EQ ( 2, match_info.score );
171     }
172 
173     // 3 Mismatches
174     {
175         REQUIRE ( FindFirst ( "xATxx", 5 ) );
176         REQUIRE_EQ ( pattern_len /*2*/, (size_t)match_info.length ); // FIXME: 2 looks correct here
177         REQUIRE_EQ ( 0, match_info.position );                       // 1
178         REQUIRE_EQ ( 3, match_info.score );                          // 2
179     }
180 
181     // Best match
182     {
183         REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
184         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
185         REQUIRE_EQ ( 18, match_info.position );
186         REQUIRE_EQ ( 0, match_info.score );
187     }
188     // First match
189     {
190         REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
191         REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
192         REQUIRE_EQ ( 0, match_info.position );
193         REQUIRE_EQ ( 1, match_info.score );
194     }
195 
196     // Match anything
197     {
198         // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
199         // by implementation, the algorithm reports the result to be "found" at the tail portion of the reference string
200         // for this degenerate case, the expected behavior is not clear, so I'll just document the reality here:
201         const string text = "xyzvuwpiuuuu";
202         REQUIRE ( FindFirst ( "xyzvuwpiuuuu", pattern_len ) );
203         REQUIRE_EQ ( pattern_len + 1, (size_t)match_info.length );
204         REQUIRE_EQ ( text . size () - ( pattern_len + 1 ), (size_t)match_info.position );
205         REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
206     }
207 
208     // Not found
209     {
210         REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
211     }
212 }
213 
FIXTURE_TEST_CASE(AgrepWumanberTest,AgrepFixture)214 FIXTURE_TEST_CASE ( AgrepWumanberTest, AgrepFixture )
215 {
216     REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_WUMANBER ) );
217 
218     // Complete match
219     {
220         REQUIRE ( FindFirst ( "MATCH", 0 ) );
221         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
222         REQUIRE_EQ ( 0, match_info.position );
223         REQUIRE_EQ ( 0, match_info.score );
224     }
225 
226     // Complete substring match
227     {
228         REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
229         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
230         REQUIRE_EQ ( 2, match_info.position );
231         REQUIRE_EQ ( 0, match_info.score );
232     }
233     // 1 Deletion
234     {
235         REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
236         REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
237         REQUIRE_EQ ( 3, match_info.position );
238         REQUIRE_EQ ( 1, match_info.score );
239     }
240 
241     // 2 Insertions
242     {
243         REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
244         REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
245         REQUIRE_EQ ( 3, match_info.position );
246         REQUIRE_EQ ( 2, match_info.score );
247     }
248 
249     // 3 Mismatches
250     {
251         REQUIRE ( FindFirst ( "xATxx", 5 ) );
252         REQUIRE_EQ ( pattern_len /*2*/, (size_t)match_info.length ); // FIXME: 2 looks correct here
253         REQUIRE_EQ ( 0, match_info.position );                       // 1
254         REQUIRE_EQ ( 3, match_info.score );                          // 2
255     }
256 
257     // Best match
258     {
259         REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
260         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
261         REQUIRE_EQ ( 18, match_info.position );
262         REQUIRE_EQ ( 0, match_info.score );
263     }
264     // First match
265     {
266         REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
267         REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
268         REQUIRE_EQ ( 0, match_info.position );
269         REQUIRE_EQ ( 1, match_info.score );
270     }
271 
272     // Match anything
273     {   // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
274         const string text = "xyzvuwpiuuuu";
275         REQUIRE ( FindFirst ( text, pattern_len ) );
276         REQUIRE_EQ ( text . size (), (size_t)match_info.length );
277         REQUIRE_EQ ( 0, match_info.position );
278         REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
279     }
280 
281     // Not found
282     {
283         REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
284     }
285 }
286 
FIXTURE_TEST_CASE(AgrepMyersTest,AgrepFixture)287 FIXTURE_TEST_CASE ( AgrepMyersTest, AgrepFixture )
288 {
289     REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_MYERS ) );
290 
291     // Complete match
292     {
293         REQUIRE ( FindFirst ( "MATCH", 0 ) );
294         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
295         REQUIRE_EQ ( 0, match_info.position );
296         REQUIRE_EQ ( 0, match_info.score );
297     }
298 
299     // Complete substring match
300     {
301         REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
302         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
303         REQUIRE_EQ ( 2, match_info.position );
304         REQUIRE_EQ ( 0, match_info.score );
305     }
306     // 1 Deletion
307     {
308         REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
309         REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
310         REQUIRE_EQ ( 3, match_info.position );
311         REQUIRE_EQ ( 1, match_info.score );
312     }
313 
314     // 2 Insertions
315     {
316         REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
317 //        REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
318         REQUIRE_EQ ( pattern_len, (size_t)match_info.length ); //FIXME: different from other algorithms
319         REQUIRE_EQ ( 3, match_info.position );
320         REQUIRE_EQ ( 2, match_info.score );
321     }
322 
323     // 3 Mismatches
324     {
325         REQUIRE ( FindFirst ( "xATxx", 5 ) );
326         REQUIRE_EQ ( 2, match_info.length  );
327         REQUIRE_EQ ( 1, match_info.position );
328         REQUIRE_EQ ( 3, match_info.score );
329     }
330 
331     // Best match
332     {
333         REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
334         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
335         REQUIRE_EQ ( 18, match_info.position );
336         REQUIRE_EQ ( 0, match_info.score );
337     }
338     // First match
339     {
340         REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
341         REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
342         REQUIRE_EQ ( 0, match_info.position );
343         REQUIRE_EQ ( 1, match_info.score );
344     }
345 
346     // Match anything
347     {   // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
348         const string text = "xyzvuwpiuuuu";
349         REQUIRE ( FindFirst ( text, pattern_len ) );
350 //        REQUIRE_EQ ( text . size (), (size_t)match_info.length );
351         REQUIRE_EQ ( 1, match_info.length ); //FIXME: different from other algorithms
352         REQUIRE_EQ ( 0, match_info.position );
353         REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
354     }
355 
356     // Not found
357     {
358         REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
359     }
360 }
361 
FIXTURE_TEST_CASE(AgrepMyersUnltdTest,AgrepFixture)362 FIXTURE_TEST_CASE ( AgrepMyersUnltdTest, AgrepFixture )
363 {
364     REQUIRE_RC ( Setup ( "MATCH", AGREP_ALG_MYERS_UNLTD ) );
365 
366     // Complete match
367     {
368         REQUIRE ( FindFirst ( "MATCH", 0 ) );
369         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
370         REQUIRE_EQ ( 0, match_info.position );
371         REQUIRE_EQ ( 0, match_info.score );
372     }
373 
374     // Complete substring match
375     {
376         REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
377         REQUIRE_EQ ( pattern_len, (size_t)match_info.length );
378         REQUIRE_EQ ( 2, match_info.position );
379         REQUIRE_EQ ( 0, match_info.score );
380     }
381     // 1 Deletion
382     {
383         REQUIRE ( FindFirst ( "xxxMACHvv", 1 ) );
384         REQUIRE_EQ ( pattern_len - 1, (size_t)match_info.length );
385         REQUIRE_EQ ( 3, match_info.position );
386         REQUIRE_EQ ( 1, match_info.score );
387     }
388 
389     // 2 Insertions
390     {
391         REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 2 ) );
392 //        REQUIRE_EQ ( pattern_len + 2, (size_t)match_info.length );
393         REQUIRE_EQ ( (size_t)pattern_len - 1, (size_t)match_info.length ); //FIXME: different from other algorithms
394         REQUIRE_EQ ( (int32_t)3, match_info.position );
395         REQUIRE_EQ ( (int32_t)2, match_info.score );
396     }
397 
398     // 3 Mismatches
399     {
400         REQUIRE ( FindFirst ( "xATxx", 5 ) );
401         REQUIRE_EQ ( 2, match_info.length );
402         REQUIRE_EQ ( 1, match_info.position );
403         REQUIRE_EQ ( 3, match_info.score );
404     }
405 
406     // Best match
407     {
408         REQUIRE ( FindBest ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
409         REQUIRE ( (size_t)match_info.length == pattern_len );
410         REQUIRE ( match_info.position == 18 );
411         REQUIRE ( match_info.score == 0 );
412     }
413     // First match
414     {
415         REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH", 1 ) );
416         REQUIRE ( (size_t)match_info.length == pattern_len - 1 );
417         REQUIRE ( match_info.position == 0 );
418         REQUIRE ( match_info.score == 1 );
419     }
420 
421     // Match anything
422     {   // threshold >= pattern_len seems to specify that a complete mismatch is acceptable
423         const string text = "xyzvuwpiuuuu";
424         REQUIRE ( FindFirst ( text, pattern_len ) );
425 //        REQUIRE_EQ ( text . size (), (size_t)match_info.length );
426         REQUIRE_EQ ( 1, match_info.length ); //FIXME: different from other algorithms
427         REQUIRE_EQ ( 0, match_info.position );
428         REQUIRE_EQ ( pattern_len, (size_t)match_info.score );
429     }
430 
431     // Not found
432     {
433         REQUIRE ( ! FindFirst ( "xyzvuwpiu", 4 ) );
434     }
435 }
436 
437 
TEST_CASE(SearchCompare)438 TEST_CASE(SearchCompare)
439 {
440     //std::cout << "This is search algorithm time comparison test" << std::endl << std::endl;
441     std::cout << "SearchCompare test is currently OFF" << std::endl;
442 
443     /*
444     TimeTest("SRR067432", SearchTimeTest::ALGORITHM_AGREP);
445     TimeTest("SRR067432", SearchTimeTest::ALGORITHM_LCS_DP);
446     TimeTest("SRR067432", SearchTimeTest::ALGORITHM_NUCSTRSTR);
447     TimeTest("SRR067432", SearchTimeTest::ALGORITHM_LCS_SAIS);*/
448 
449     //std::cout << std::endl << "Time comparison test acomplished" << std::endl;
450 }
451 
TEST_CASE(TestCaseAgrep)452 TEST_CASE(TestCaseAgrep)
453 {
454     std::cout << "TestCaseAgrep test is currently OFF" << std::endl;
455     //VDBSearchTest::CFindLinker().Run(AGREP_MODE_ASCII|AGREP_ALG_MYERS, "SRR067432");//"SRR068408");
456     //VDBSearchTest::CFindLinker().Run(AGREP_MODE_ASCII|AGREP_ALG_MYERS, "SRR067408_extr");
457 
458 /*
459     char const pszRunNamesFile[] = "454Runs.txt";
460     FILE* fRuns = fopen(pszRunNamesFile, "r");
461     if (!fRuns)
462     {
463         printf("Failed to open file: %s\n", pszRunNamesFile);
464         return;
465     }
466     bool bSkipping = true;
467     for (; !feof(fRuns); )
468     {
469         char szRuns[512] = "";
470         char* psz = fgets(szRuns, sizeof(szRuns), fRuns);
471         if (!psz)
472             break;
473         trim_eol(psz);
474         if (bSkipping && !strcmp(psz, "SRR067408"))
475         {
476             bSkipping = false;
477         }
478         if (bSkipping)
479             continue;
480         VDBSearchTest::CFindLinker().Run(AGREP_MODE_ASCII|AGREP_ALG_MYERS, psz);
481     }*/
482 }
483 
484 // Fgrep
485 
RunFgrep(FgrepFlags p_alg)486 static void RunFgrep ( FgrepFlags p_alg )
487 {   // VDB-2669: creates uninitialized memory
488     Fgrep* fg;
489     const char* queries[] = { "RRRRRAH" };
490     if ( FgrepMake ( & fg, FGREP_MODE_ASCII | p_alg, queries, 1 ) ) // this used to leave uninitialized memory ...
491         throw logic_error ( "RunFgrep: FgrepMake() failed" );
492 
493     const std::string str ( 1000, 'A' );
494     FgrepMatch matchinfo;
495     if ( 0 != FgrepFindFirst ( fg, str . data (), str . size (), & matchinfo ) ) // ... the use of which showed up in valgrind here, and sometimes caused a crash
496         throw logic_error ( "RunFgrep: FgrepFindFirst() found a false match" );
497 
498     FgrepFree ( fg );
499 }
500 
TEST_CASE(DumbGrep_Crash)501 TEST_CASE ( DumbGrep_Crash )
502 {
503     RunFgrep ( FGREP_ALG_DUMB );
504 }
505 
TEST_CASE(BoyerMooreGrep_Crash)506 TEST_CASE ( BoyerMooreGrep_Crash )
507 {
508     RunFgrep ( FGREP_ALG_BOYERMOORE );
509 }
510 
TEST_CASE(AhoGrep_Crash)511 TEST_CASE ( AhoGrep_Crash )
512 {
513     RunFgrep ( FGREP_ALG_AHOCORASICK );
514 }
515 
516 // Smith-Waterman
517 
518 class SmithWatermanFixture
519 {
520 public:
SmithWatermanFixture()521 	SmithWatermanFixture()
522 	:  m_self ( 0 ),
523        m_pattern_len ( 0 )
524 	{
525 	}
~SmithWatermanFixture()526 	~SmithWatermanFixture()
527 	{
528 		::SmithWatermanWhack ( m_self );
529 	}
530 
Setup(const string & p_query)531 	rc_t Setup ( const string& p_query )
532 	{
533         m_pattern_len = p_query . size ();
534 		return ::SmithWatermanMake ( & m_self, p_query . data() );
535 	}
536 
FindFirst(const string & p_text,uint32_t p_threshold=numeric_limits<uint32_t>::max ())537 	bool FindFirst ( const string& p_text, uint32_t p_threshold = numeric_limits<uint32_t>::max() )
538 	{
539 		return ::SmithWatermanFindFirst ( m_self,  p_threshold, p_text . c_str (), p_text . size (), & m_match_info ) == 0;
540 	}
541 
542 public:
543 	::SmithWaterman*       m_self;
544 	size_t                 m_pattern_len;
545     ::SmithWatermanMatch   m_match_info;
546 };
547 
FIXTURE_TEST_CASE(SmithWatermanTest,SmithWatermanFixture)548 FIXTURE_TEST_CASE ( SmithWatermanTest, SmithWatermanFixture )
549 {
550     REQUIRE_RC ( Setup ( "MATCH" ) );
551 
552     // VDB-3034: crash on query of length 0, when called right after initialization of the search object
553     {
554         REQUIRE(!FindFirst(""));
555     }
556 
557 	// threshold in FirstMatch varies from 0 (a complete mismatch is acceptable) to 2*m_pattern_len (perfect match)
558 
559     // Complete match
560     {
561         REQUIRE ( FindFirst ( "MATCH" ) );
562         REQUIRE_EQ ( m_pattern_len,  ( size_t ) m_match_info.length );
563         REQUIRE_EQ ( 0, m_match_info.position );
564         REQUIRE_EQ ( 2 * m_pattern_len, ( size_t ) m_match_info.score );
565     }
566     // Complete substring match
567     {
568         REQUIRE ( FindFirst ( "xxMATCHvv", 0 ) );
569         REQUIRE_EQ ( m_pattern_len, (size_t)m_match_info.length );
570         REQUIRE_EQ ( 2, m_match_info.position );
571         REQUIRE_EQ ( 2 * m_pattern_len, ( size_t ) m_match_info.score );
572     }
573     // 1 Deletion
574     {
575         REQUIRE ( FindFirst ( "xxxMACHvv", 7 ) );
576         REQUIRE_EQ ( m_pattern_len - 1, (size_t)m_match_info.length );
577         REQUIRE_EQ ( 3, m_match_info.position );
578         REQUIRE_EQ ( 7, m_match_info.score );
579     }
580 
581     // 2 Insertions
582     {
583         REQUIRE ( FindFirst ( "xxxMAdTCaHvv", 8 ) );
584         REQUIRE_EQ ( m_pattern_len + 2, (size_t)m_match_info.length );
585         REQUIRE_EQ ( 3, m_match_info.position );
586         REQUIRE_EQ ( 8, m_match_info.score );
587     }
588 
589     // 3 Mismatches
590     {
591         REQUIRE ( FindFirst ( "xATxx", 4 ) );
592         REQUIRE_EQ ( 2, m_match_info.length );
593         REQUIRE_EQ ( 1, m_match_info.position );
594         REQUIRE_EQ ( 4, m_match_info.score );
595     }
596 
597     // First match
598     {
599         REQUIRE ( FindFirst ( "MTCH__MITCH_MTACH_MATCH_MATCH" ) );
600         REQUIRE_EQ ( m_pattern_len, (size_t)m_match_info.length );
601         REQUIRE_EQ ( 18, m_match_info.position );
602         REQUIRE_EQ ( 2 * m_pattern_len,  (size_t)m_match_info.score );
603     }
604 
605     // Match anything
606     {   // threshold is from 0 (a complete mismatch is acceptable) to 2*m_pattern_len (perfect match)
607         const string text = "xyzvuwpiuuuu";
608         REQUIRE ( FindFirst ( text, 0 ) );
609         REQUIRE_EQ ( 0, m_match_info.length );
610         REQUIRE_EQ ( 0, m_match_info.position );
611         REQUIRE_EQ ( 0, m_match_info.score );
612     }
613 
614     // Not found
615     {
616         REQUIRE ( ! FindFirst ( "xyzvuwpiu", 1 ) );
617     }
618 
619 }
620 
621 // Ref-Variation
622 #if 0
623 static
624 void
625 PrintMatrix ( const INSDC_dna_text* p_ref, const INSDC_dna_text* p_query, const int p_matrix[], size_t p_rows, size_t p_cols )
626 {
627     cout << "    ";
628     while ( *p_query )
629     {
630         cout << " " << *p_query;
631         ++p_query;
632     }
633     cout << endl;
634     for ( size_t i = 0; i < p_rows - 1; ++i ) // skip row 0 ( all 0s )
635     {
636         cout << i << " " << p_ref[i] << ": ";
637         for ( size_t j = 0; j < p_cols - 1; ++j ) // skip the 0 at the start
638         {
639             cout << p_matrix [ ( i + 1 ) * p_cols + j + 1 ] << " ";
640         }
641         cout << endl;
642     }
643 }
644 #endif
645 
TEST_CASE(RefVariation_crash)646 TEST_CASE ( RefVariation_crash)
647 {
648     RefVariation* self;
649     INSDC_dna_text ref[] = "ACGTACGTACGTACGTACGTACGTACGTACGT";
650     REQUIRE_RC_FAIL ( RefVariationIUPACMake ( &self, ref, string_size ( ref ), 0, 0, "", 0, ::refvarAlgSW ) );
651 }
652 
653 #ifdef _WIN32
654 #define PRSIZET "I"
655 #else
656 #define PRSIZET "z"
657 #endif
658 
print_refvar_obj(::RefVariation const * obj)659 string print_refvar_obj (::RefVariation const* obj)
660 {
661     size_t allele_len = 0, allele_start = 0, allele_len_on_ref = 0;
662     char const* allele = NULL;
663     ::RefVariationGetAllele( obj, & allele, & allele_len, & allele_start );
664     ::RefVariationGetAlleleLenOnRef ( obj, & allele_len_on_ref );
665 
666     //printf ("<no ref name>:%"PRSIZET"u:%"PRSIZET"u:%.*s\n", allele_start, allele_len_on_ref, (int)allele_len, allele);
667 
668     return string ( allele, allele_len );
669 }
670 
671 #undef PRSIZET
672 
673 
vrefvar_bounds(::RefVarAlg alg,char const * ref,size_t ref_len,size_t pos,size_t len_on_ref,char const * query,size_t query_len)674 string vrefvar_bounds (::RefVarAlg alg, char const* ref,
675     size_t ref_len, size_t pos, size_t len_on_ref,
676     char const* query, size_t query_len)
677 {
678 
679     ::RefVariation* obj;
680 
681     rc_t rc = ::RefVariationIUPACMake ( & obj, ref, ref_len, pos, len_on_ref, query, query_len, alg );
682 
683     string ret = print_refvar_obj ( obj );
684 
685     if ( rc == 0 )
686         ::RefVariationRelease( obj );
687 
688     return ret;
689 }
690 
vrefvar_bounds_n(::RefVarAlg alg)691 string vrefvar_bounds_n(::RefVarAlg alg)
692 {
693     //                  01234567890123456789
694     char const ref[] = "NNNNNNNNNNTAACCCTAAC";
695     //                       CCCCTTAGG-
696 
697     size_t pos = 5, len_on_ref = 10;
698     char const query[] = "CCCCTTAGG";
699 
700     return vrefvar_bounds ( alg, ref, strlen(ref), pos, len_on_ref, query, strlen(query) );
701 }
702 
vrefvar_bounds_0(::RefVarAlg alg)703 string vrefvar_bounds_0(::RefVarAlg alg)
704 {
705     //                  01234567890123456789
706     char const ref[] = "TAACCCTAAC";
707     //                  TTAGG-
708 
709     size_t pos = 0, len_on_ref = 5;
710     char const query[] = "TAGG";
711 
712     return vrefvar_bounds ( alg, ref, strlen(ref), pos, len_on_ref, query, strlen(query) );
713 }
714 
vrefvar_bounds_N0(::RefVarAlg alg)715 string vrefvar_bounds_N0(::RefVarAlg alg)
716 {
717     //                  01234567890123456789
718     char const ref[] = "NNNNNTAACCCTAAC";
719     //                  CCCCTTTAGG-
720 
721     size_t pos = 0, len_on_ref = 10;
722     char const query[] = "CCCCTTAGG";
723 
724     return vrefvar_bounds ( alg, ref, strlen(ref), pos, len_on_ref, query, strlen(query) );
725 }
726 
TEST_CASE(RefVariation_bounds_N)727 TEST_CASE ( RefVariation_bounds_N )
728 {
729     //printf ("TODO: this test is derived from the real example which hangs up now (2015-12-14):\n");
730     //printf ("echo \"67068302 NC_000001.10:9995:10:CCCCTTAGG\" | var-expand --algorithm=sw\n");
731 
732     REQUIRE_EQ ( string ( "NNNNNCCCCTTAGGCTAA" ), vrefvar_bounds_n ( ::refvarAlgSW ) );
733     REQUIRE_EQ ( string ( "CCCCTTAGGC" ), vrefvar_bounds_n ( ::refvarAlgRA ) );
734 
735     REQUIRE_EQ ( string ( "AGGC" ), vrefvar_bounds_0 ( ::refvarAlgSW ) );
736     REQUIRE_EQ ( string ( "TAGG" ), vrefvar_bounds_0 ( ::refvarAlgRA ) );
737 
738     REQUIRE_EQ ( string ( "CCCCTTAGGCTAA" ), vrefvar_bounds_N0 ( ::refvarAlgSW ) );
739     REQUIRE_EQ ( string ( "CCCCTTAGGC" ), vrefvar_bounds_N0 ( ::refvarAlgRA ) );
740 }
741 
742 // Nucstrstr
743 static
744 void
ConvertAsciiTo2NAPacked(const string & p_read,unsigned char * pBuf2NA,size_t nBuf2NASize)745 ConvertAsciiTo2NAPacked ( const string& p_read, unsigned char* pBuf2NA, size_t nBuf2NASize )
746 {
747     static unsigned char map [ 1 << ( sizeof ( char ) * 8 ) ];
748     map['A'] = map['a'] = 0;
749     map['C'] = map['c'] = 1;
750     map['G'] = map['g'] = 2;
751     map['T'] = map['t'] = 3;
752 
753     static size_t shiftLeft [ 4 ] = { 6, 4, 2, 0 };
754 
755     fill ( pBuf2NA, pBuf2NA + nBuf2NASize, 0 );
756 
757     for ( size_t iChar = 0; iChar < p_read . size (); ++iChar )
758     {
759         size_t iByte = iChar / 4;
760         if ( iByte > nBuf2NASize )
761         {
762             assert ( false );
763             break;
764         }
765 
766         pBuf2NA[iByte] |= map [ size_t ( p_read [ iChar ] ) ] << shiftLeft [ iChar % 4 ];
767     }
768 }
769 
770 static
771 int
RunNucStrtr(const string & p_ref,const string & p_query,bool p_positional)772 RunNucStrtr ( const string& p_ref, const string& p_query, bool p_positional )
773 {
774     unsigned char buf2na [ 1024 ];
775     ConvertAsciiTo2NAPacked ( p_ref, buf2na, sizeof ( buf2na ) );
776 
777     NucStrstr *nss;
778     if ( NucStrstrMake ( & nss, p_positional ? 1 : 0, p_query . c_str (), p_query . size () ) != 0 )
779         throw logic_error ( "RunNucStrtr: NucStrstrMake() failed" );
780     unsigned int selflen = 0u;
781     // NB: for now, all searches start are from the beginning ( nucstrstr.h says "may be >= 4"; not sure what that means )
782     const unsigned int pos = 0;
783     int ret = NucStrstrSearch ( nss, ( const void * ) buf2na, pos, p_ref . size () - pos, & selflen );
784     NucStrstrWhack ( nss );
785     return ret;
786 }
787 
TEST_CASE(Nucstrstr_NonPositional_NotFound)788 TEST_CASE ( Nucstrstr_NonPositional_NotFound )
789 {
790     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACTA", false ) );
791 }
792 
TEST_CASE(Nucstrstr_NonPositional_Found_AtStart)793 TEST_CASE ( Nucstrstr_NonPositional_Found_AtStart )
794 {
795     REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACGTACGTACG", false ) );
796 }
797 
TEST_CASE(Nucstrstr_NonPositional_Found_InMiddle)798 TEST_CASE ( Nucstrstr_NonPositional_Found_InMiddle )
799 {
800     REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "GTACGTACG", false ) );
801 }
802 
TEST_CASE(Nucstrstr_Positional_NoExpr_NotFound)803 TEST_CASE ( Nucstrstr_Positional_NoExpr_NotFound )
804 {
805     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACCGT", true ) );
806 }
807 
TEST_CASE(Nucstrstr_Positional_NoExpr_Found_AtStart)808 TEST_CASE ( Nucstrstr_Positional_NoExpr_Found_AtStart )
809 {
810     REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACGTACGTACG", true ) );
811 }
812 
TEST_CASE(Nucstrstr_Positional_NoExpr_Found_InMiddle)813 TEST_CASE ( Nucstrstr_Positional_NoExpr_Found_InMiddle )
814 {
815     REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "TACGTACG", true ) );
816 }
817 
TEST_CASE(Nucstrstr_Positional_Not_False)818 TEST_CASE ( Nucstrstr_Positional_Not_False )
819 {
820     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "!TACGTACG", true ) );
821 }
TEST_CASE(Nucstrstr_Positional_Not_True)822 TEST_CASE ( Nucstrstr_Positional_Not_True )
823 {   // "!" returns only 0 or 1, not the position
824     REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "!AG", true ) );
825 }
826 
TEST_CASE(Nucstrstr_Positional_Caret_Found)827 TEST_CASE ( Nucstrstr_Positional_Caret_Found )
828 {
829     REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "^AC", true ) );
830 }
TEST_CASE(Nucstrstr_Positional_Caret_NotFound)831 TEST_CASE ( Nucstrstr_Positional_Caret_NotFound )
832 {
833     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "^C", true ) );
834 }
835 
TEST_CASE(Nucstrstr_Positional_Dollar_Found)836 TEST_CASE ( Nucstrstr_Positional_Dollar_Found )
837 {
838     REQUIRE_EQ ( 33, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGCA$", true ) );
839 }
TEST_CASE(Nucstrstr_Positional_Dollar_NotFound)840 TEST_CASE ( Nucstrstr_Positional_Dollar_NotFound )
841 {
842     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCAA", "TGCA$", true ) );
843 }
844 
TEST_CASE(Nucstrstr_Positional_OR_Found_1)845 TEST_CASE ( Nucstrstr_Positional_OR_Found_1 )
846 {
847     REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA|TACG", true ) );
848 }
TEST_CASE(Nucstrstr_Positional_OR_Found_2)849 TEST_CASE ( Nucstrstr_Positional_OR_Found_2 )
850 {
851     REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA||TACG", true ) );
852 }
TEST_CASE(Nucstrstr_Positional_OR_Found_FirstMatchPositionReported)853 TEST_CASE ( Nucstrstr_Positional_OR_Found_FirstMatchPositionReported )
854 {
855     REQUIRE_EQ ( 2, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA|CGT|TACG", true ) );
856 }
TEST_CASE(Nucstrstr_Positional_OR_NotFound)857 TEST_CASE ( Nucstrstr_Positional_OR_NotFound )
858 {
859     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGTTGCA", "TGAA|TACA", true ) );
860 }
861 
TEST_CASE(Nucstrstr_Positional_AND_Found_LastMatchPositionReported)862 TEST_CASE ( Nucstrstr_Positional_AND_Found_LastMatchPositionReported )
863 {
864     REQUIRE_EQ ( 4, RunNucStrtr ( "ACGTACGT", "CGTA&TACG", true ) );
865 }
TEST_CASE(Nucstrstr_Positional_AND_NotFound)866 TEST_CASE ( Nucstrstr_Positional_AND_NotFound )
867 {
868     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGT", "TACG&TACA", true ) );
869 }
870 
TEST_CASE(Nucstrstr_Positional_AND_OR_Found_1)871 TEST_CASE ( Nucstrstr_Positional_AND_OR_Found_1 )
872 {
873     REQUIRE_EQ ( 3, RunNucStrtr ( "ACGTACGT", "CGTA&TACC|GTAC", true ) );
874 }
TEST_CASE(Nucstrstr_Positional_AND_OR_Found_2)875 TEST_CASE ( Nucstrstr_Positional_AND_OR_Found_2 )
876 {
877     REQUIRE_EQ ( 3, RunNucStrtr ( "ACGTACGT", "CGTA&(TACC|GTAC)", true ) );
878 }
TEST_CASE(Nucstrstr_Positional_AND_OR_Found_3)879 TEST_CASE ( Nucstrstr_Positional_AND_OR_Found_3 )
880 {
881     REQUIRE_EQ ( 2, RunNucStrtr ( "ACGTACGT", "(TACC|GTAC)&&CGTA", true ) );
882 }
883 
TEST_CASE(Nucstrstr_Positional_AND_OR_NOT_NotFound)884 TEST_CASE ( Nucstrstr_Positional_AND_OR_NOT_NotFound )
885 {
886     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGT", "(TACC|GTAC)&&!CGTA", true ) );
887 }
TEST_CASE(Nucstrstr_Positional_AND_OR_NOT_Found)888 TEST_CASE ( Nucstrstr_Positional_AND_OR_NOT_Found )
889 {
890     REQUIRE_EQ ( 1, RunNucStrtr ( "ACGTACGT", "(TACC|GTAC)&&!CATA", true ) );
891 }
892 
TEST_CASE(Nucstrstr_Positional_Error)893 TEST_CASE ( Nucstrstr_Positional_Error )
894 {
895     REQUIRE_THROW ( RunNucStrtr ( "ACGTACGT", "(TACC", true ) );
896 }
897 
TEST_CASE(Nucstrstr_NonPositional_4NA_NotFound)898 TEST_CASE ( Nucstrstr_NonPositional_4NA_NotFound )
899 {
900     REQUIRE_EQ ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACTN", false ) );
901 }
902 
TEST_CASE(Nucstrstr_NonPositional_4NA_Found_AtStart)903 TEST_CASE ( Nucstrstr_NonPositional_4NA_Found_AtStart )
904 {
905     REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "ACGTACGTACN", false ) );
906 }
907 
TEST_CASE(Nucstrstr_NonPositional_4NA_Found_InMiddle)908 TEST_CASE ( Nucstrstr_NonPositional_4NA_Found_InMiddle )
909 {
910     REQUIRE_NE ( 0, RunNucStrtr ( "ACGTACGTACGTACGTACGTACGTACGTACGT", "GTACGTACN", false ) );
911 }
912 
913 
914 
915 //////////////////////////////////////////// Main
916 extern "C"
917 {
918 
919 #include <kapp/args.h>
920 #include <kfg/config.h>
921 
KAppVersion(void)922 ver_t CC KAppVersion ( void )
923 {
924     return 0x1000000;
925 }
UsageSummary(const char * progname)926 rc_t CC UsageSummary (const char * progname)
927 {
928     return 0;
929 }
930 
Usage(const Args * args)931 rc_t CC Usage ( const Args * args )
932 {
933     return 0;
934 }
935 
936 const char UsageDefaultName[] = "test-search";
937 
KMain(int argc,char * argv[])938 rc_t CC KMain ( int argc, char *argv [] )
939 {
940     KConfigDisableUserSettings();
941     rc_t rc = TestSuiteSearch(argc, argv);
942     return rc;
943 }
944 
945 } // end of extern "C"
946