1 /*  $Id: unit_test_string_constraint.cpp 571441 2018-09-26 16:07:20Z kachalos $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Colleen Bollin
27 *
28 * File Description:
29 *   Simple unit test for CString_constraint.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <objects/macro/String_constraint.hpp>
37 #include <objects/macro/String_location.hpp>
38 #include <objects/macro/Word_substitution.hpp>
39 #include <objects/macro/Word_substitution_set.hpp>
40 #include <objects/macro/Suspect_rule.hpp>
41 #include <objects/macro/Replace_rule.hpp>
42 #include <objects/macro/Replace_func.hpp>
43 #include <objects/macro/Simple_replace.hpp>
44 #include <objects/macro/Search_func.hpp>
45 
46 #include <corelib/ncbiapp.hpp>
47 #include <corelib/test_boost.hpp>
48 
49 #include <util/util_misc.hpp>
50 
51 #include <common/test_assert.h>  /* This header must go last */
52 
53 USING_NCBI_SCOPE;
54 USING_SCOPE(objects);
55 
NCBITEST_AUTO_INIT()56 NCBITEST_AUTO_INIT()
57 {
58 }
59 
// Checks CWord_substitution::GetMatchLens() for the word "fruit" whose
// synonym list contains an entry ("fruit, canned") that begins with the
// word itself.  Exactly one match length, 13, is expected.
BOOST_AUTO_TEST_CASE(Test_WordSubstitution)
{
    CWord_substitution word;

    word.SetWord("fruit");
    word.SetSynonyms().push_back("apple");
    word.SetSynonyms().push_back("orange");
    word.SetSynonyms().push_back("pear");
    word.SetSynonyms().push_back("grapefruit");
    word.SetSynonyms().push_back("fruit, canned");

    const vector<size_t> match_lens = word.GetMatchLens("fruit, canned", "fruit", 0);

    // REQUIRE rather than CHECK: a failed CHECK lets the test continue, and
    // indexing match_lens[0] on an empty vector is undefined behaviour.
    // Unsigned literals avoid a signed/unsigned comparison with size_t.
    BOOST_REQUIRE_EQUAL(match_lens.size(), 1u);
    BOOST_CHECK_EQUAL(match_lens[0], 13u);
}
75 
76 
BOOST_AUTO_TEST_CASE(Test_SimpleConstraints)77 BOOST_AUTO_TEST_CASE(Test_SimpleConstraints)
78 {
79     CString_constraint s;
80 
81     s.SetMatch_text("cat");
82     s.SetMatch_location(eString_location_contains);
83 
84     BOOST_CHECK_EQUAL(s.Match("cat"), true);
85     BOOST_CHECK_EQUAL(s.Match("catalog"), true);
86     BOOST_CHECK_EQUAL(s.Match("the catalog"), true);
87     BOOST_CHECK_EQUAL(s.Match("ducat"), true);
88     BOOST_CHECK_EQUAL(s.Match("dog"), false);
89     BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), true);
90 
91     s.SetMatch_location(eString_location_equals);
92     BOOST_CHECK_EQUAL(s.Match("cat"), true);
93     BOOST_CHECK_EQUAL(s.Match("catalog"), false);
94     BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
95     BOOST_CHECK_EQUAL(s.Match("ducat"), false);
96     BOOST_CHECK_EQUAL(s.Match("dog"), false);
97     BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), false);
98 
99     s.SetMatch_location(eString_location_starts);
100     BOOST_CHECK_EQUAL(s.Match("cat"), true);
101     BOOST_CHECK_EQUAL(s.Match("catalog"), true);
102     BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
103     BOOST_CHECK_EQUAL(s.Match("ducat"), false);
104     BOOST_CHECK_EQUAL(s.Match("dog"), false);
105     BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), false);
106 
107     s.SetMatch_location(eString_location_ends);
108     BOOST_CHECK_EQUAL(s.Match("cat"), true);
109     BOOST_CHECK_EQUAL(s.Match("catalog"), false);
110     BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
111     BOOST_CHECK_EQUAL(s.Match("ducat"), true);
112     BOOST_CHECK_EQUAL(s.Match("dog"), false);
113     BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), false);
114 
115 	// eString_location_inlist - no longer supported
116 	//s.SetMatch_location(eString_location_inlist);
117     //BOOST_CHECK_EQUAL(s.Match("cat"), true);
118     //BOOST_CHECK_EQUAL(s.Match("catalog"), false);
119     //BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
120     //BOOST_CHECK_EQUAL(s.Match("ducat"), false);
121     //BOOST_CHECK_EQUAL(s.Match("dog"), false);
122     //BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), false); // because list is in constraint
123 
124     //s.SetMatch_text("dog, cat, cow");
125     //BOOST_CHECK_EQUAL(s.Match("cat"), true);
126     //BOOST_CHECK_EQUAL(s.Match("catalog"), false);
127     //BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
128     //BOOST_CHECK_EQUAL(s.Match("ducat"), false);
129     //BOOST_CHECK_EQUAL(s.Match("dog"), true);
130 
131 	s.SetMatch_text("dog, cat, cow");
132 	s.SetMatch_location(eString_location_contains);
133     s.SetIgnore_punct(true);
134     BOOST_CHECK_EQUAL(s.Match("dog cat cow"), true);
135     BOOST_CHECK_EQUAL(s.Match("dog  cat cow"), false);
136     BOOST_CHECK_EQUAL(s.Match("dogcatcow"), false);
137     BOOST_CHECK_EQUAL(s.Match("dog.cat.cow"), false);
138     BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), false);
139 
140     s.SetIgnore_space(true);
141     BOOST_CHECK_EQUAL(s.Match("dog cat cow"), true);
142     BOOST_CHECK_EQUAL(s.Match("dog  cat cow"), true);
143     BOOST_CHECK_EQUAL(s.Match("dogcatcow"), true);
144     BOOST_CHECK_EQUAL(s.Match("dog.cat.cow"), true);
145     BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), true);
146 
147     s.ResetIgnore_punct();
148     BOOST_CHECK_EQUAL(s.Match("dog cat cow"), false);
149     BOOST_CHECK_EQUAL(s.Match("dog  cat cow"), false);
150     BOOST_CHECK_EQUAL(s.Match("dogcatcow"), false);
151     BOOST_CHECK_EQUAL(s.Match("dog.cat.cow"), false);
152     BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), true);
153 
154     s.Reset();
155     s.SetMatch_text("cat");
156     s.SetWhole_word(true);
157     s.SetMatch_location(eString_location_contains);
158     BOOST_CHECK_EQUAL(s.Match("cat"), true);
159     BOOST_CHECK_EQUAL(s.Match("catalog"), false);
160     BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
161     BOOST_CHECK_EQUAL(s.Match("ducat"), false);
162     BOOST_CHECK_EQUAL(s.Match("dog"), false);
163     BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), true);
164 
165 
166 
167     string in, out;
168     s.Reset();
169     s.SetMatch_text("cat");
170     s.SetMatch_location(eString_location_contains);
171 
172     in = "cat";
173     BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
174     BOOST_CHECK_EQUAL(out, "dog");
175 
176     in = "catalog";
177     BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
178     BOOST_CHECK_EQUAL(out, "dogalog");
179 
180     in = "the catalog";
181     BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
182     BOOST_CHECK_EQUAL(out, "the dogalog");
183 
184     in = "ducat";
185     BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
186     BOOST_CHECK_EQUAL(out, "dudog");
187 
188     in = "dog, cat, cow";
189     BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
190     BOOST_CHECK_EQUAL(out, "dog, dog, cow");
191 
192     in = "feline";
193     BOOST_CHECK(!s.ReplaceStringConstraintPortionInString(out, in, "dog"));
194     BOOST_CHECK_EQUAL(out, "feline");
195 
196 }
197 
198 
// Matches a fixed sentence against constraints whose match text differs
// from it word by word, relying on the word substitutions registered via
// SetIgnore_words().
BOOST_AUTO_TEST_CASE(Test_StringConstraintWithSynonyms)
{
    const string sentence = "The quick brown fox jumped over the lazy dog.";

    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_contains);
    constraint.SetMatch_text("dog leaped");
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("leap", "jump")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("dog", "fox")));

    BOOST_CHECK(constraint.Match(sentence));

    // Second scenario: the whole sentence must be equal, one substitution
    // per differing word.
    constraint.Reset();
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetMatch_text("A fast beige wolf leaped across a sleepy beagle.");

    // "article" and "hop" are kept as named handles because their flags
    // are toggled further down.
    CRef<CWord_substitution> article(new CWord_substitution("a", "the"));
    constraint.SetIgnore_words().Set().push_back(article);
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("fast", "quick")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("beige", "brown")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("wolf", "fox")));
    CRef<CWord_substitution> hop(new CWord_substitution("leap", "jump"));
    constraint.SetIgnore_words().Set().push_back(hop);
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("across", "over")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("sleepy", "lazy")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("beagle", "dog")));

    BOOST_CHECK(constraint.Match(sentence));

    // The match is expected to fail once "leap" must be a whole word.
    hop->SetWhole_word(true);
    BOOST_CHECK(!constraint.Match(sentence));

    // The match is expected to fail when the article substitution is
    // case sensitive.
    hop->SetWhole_word(false);
    article->SetCase_sensitive(true);
    BOOST_CHECK(!constraint.Match(sentence));
}
245 
// String constraint "Homo sapiens" (equals, ignoring spaces and
// punctuation) combined with two ignore-word substitutions: one for the
// full name and one for "sapiens" alone.
BOOST_AUTO_TEST_CASE(Test_synonyms)
{
    static const char* const kSpeciesSynonyms[] = {
        "human",
        "Homo sapien",
        "Homosapiens",
        "Homo-sapiens",
        "Homo spiens",
        "Homo Sapience",
        "homosapein",
        "homosapiens",
        "homosapien",
        "homo_sapien",
        "homo_sapiens",
        "Homosipian"
    };
    static const char* const kEpithetSynonyms[] = {
        "sapien",
        "sapeins",
        "sapein",
        "sapins",
        "sapens",
        "sapin",
        "sapen",
        "sapians",
        "sapian",
        "sapies",
        "sapie"
    };
    const size_t species_count = sizeof(kSpeciesSynonyms) / sizeof(kSpeciesSynonyms[0]);
    const size_t epithet_count = sizeof(kEpithetSynonyms) / sizeof(kEpithetSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("Homo sapiens");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // Substitution for the full species name.
    CRef<CWord_substitution> species_subst(new CWord_substitution);
    species_subst->SetWord("Homo sapiens");
    for (size_t i = 0; i < species_count; ++i) {
        species_subst->SetSynonyms().push_back(kSpeciesSynonyms[i]);
    }
    constraint.SetIgnore_words().Set().push_back(species_subst);

    // Substitution for the epithet on its own.
    CRef<CWord_substitution> epithet_subst(new CWord_substitution);
    epithet_subst->SetWord("sapiens");
    for (size_t i = 0; i < epithet_count; ++i) {
        epithet_subst->SetSynonyms().push_back(kEpithetSynonyms[i]);
    }
    constraint.SetIgnore_words().Set().push_back(epithet_subst);

    BOOST_CHECK(constraint.Match("human"));
    BOOST_CHECK(!constraint.Match("humano"));
    BOOST_CHECK(constraint.Match("Homo sapien"));
    BOOST_CHECK(!constraint.Match("Human sapien"));
    BOOST_CHECK(!constraint.Match("sapien"));
}
300 
301 
// Case for ticket SQD-2048: "cytochrome b gene" with substitutions for the
// whole phrase and for the trailing word "gene".
BOOST_AUTO_TEST_CASE(Test_SQD_2048)
{
    static const char* const kPhraseSynonyms[] = {
        "cytochrome b cytb",
        "cytochrome b cyt b",
        "cytochrome b (cytb)",
        "cytochrome b (cyt b)"
    };
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t phrase_count = sizeof(kPhraseSynonyms) / sizeof(kPhraseSynonyms[0]);
    const size_t gene_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("cytochrome b gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("cytochrome b gene");
    for (size_t i = 0; i < phrase_count; ++i) {
        phrase_subst->SetSynonyms().push_back(kPhraseSynonyms[i]);
    }
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    for (size_t i = 0; i < gene_count; ++i) {
        gene_subst->SetSynonyms().push_back(kGeneSynonyms[i]);
    }
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    // Every remaining flag is explicitly set to false.
    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    // Debugging aid: NcbiCout << MSerial_AsnText << constraint;

    BOOST_CHECK(constraint.Match("cytochrome b gene"));
    BOOST_CHECK(constraint.Match("cytochrome b partial"));
    BOOST_CHECK(!constraint.Match("cytb"));
}
347 
348 
// Case for ticket SQD-2093: a suspect rule that finds "localisation"
// and replaces it with "localization" via a simple (non whole-string)
// replacement.
BOOST_AUTO_TEST_CASE(Test_SQD_2093)
{
    CSuspect_rule rule;

    // Find part of the rule.
    CString_constraint& find_constraint = rule.SetFind().SetString_constraint();
    find_constraint.SetMatch_text("localisation");
    find_constraint.SetMatch_location(eString_location_contains);

    // Replace part of the rule.
    CSimple_replace& simple_replace =
        rule.SetReplace().SetReplace_func().SetSimple_replace();
    simple_replace.SetReplace("localization");
    simple_replace.SetWhole_string(false);
    simple_replace.SetWeasel_to_putative(false);
    rule.SetReplace().SetMove_to_note(false);

    string text = "Localisation of periplasmic protein complexes";
    BOOST_CHECK(rule.GetFind().Match(text));
    BOOST_CHECK(rule.ApplyToString(text));
    // ApplyToString() modifies its argument in place; the expected result
    // starts with a lower-case 'l'.
    BOOST_CHECK_EQUAL(text, "localization of periplasmic protein complexes");
}
366 
367 
// "cytochrome oxidase subunit I gene" with substitutions for the whole
// phrase, for "gene" (once without and once with synonyms) and for
// "oxidase".
BOOST_AUTO_TEST_CASE(Test_CytochromeOxidase)
{
    static const char* const kPhraseSynonyms[] = {
        "cytochrome oxidase I gene",
        "cytochrome oxidase I",
        "cytochrome subunit I"
    };
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t phrase_count = sizeof(kPhraseSynonyms) / sizeof(kPhraseSynonyms[0]);
    const size_t gene_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("cytochrome oxidase subunit I gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("cytochrome oxidase subunit I gene");
    for (size_t i = 0; i < phrase_count; ++i) {
        phrase_subst->SetSynonyms().push_back(kPhraseSynonyms[i]);
    }
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    // "gene" with an empty synonym list.
    CRef<CWord_substitution> bare_gene_subst(new CWord_substitution());
    bare_gene_subst->SetWord("gene");
    bare_gene_subst->SetCase_sensitive(false);
    bare_gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(bare_gene_subst);

    // "gene" with its synonyms.  Original author's note: instead of the
    // separate substitution above, one could add
    //     gene_subst->SetSynonyms().push_back(kEmptyStr);
    // here, with the same effect.
    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    for (size_t i = 0; i < gene_count; ++i) {
        gene_subst->SetSynonyms().push_back(kGeneSynonyms[i]);
    }
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    CRef<CWord_substitution> oxidase_subst(new CWord_substitution());
    oxidase_subst->SetWord("oxidase");
    oxidase_subst->SetSynonyms().push_back("oxydase");
    oxidase_subst->SetCase_sensitive(false);
    oxidase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(oxidase_subst);

    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK(constraint.Match("cytochrome oxidase subunit I"));
    BOOST_CHECK(constraint.Match("cytochrome oxydase subunit I"));
    BOOST_CHECK(constraint.Match("cytochrome oxydase subunit I gene"));
}
426 
// Case-insensitive equality on "MHC CLASS II ANTIGEN gene" with a
// substitution registered for the word "gene".
BOOST_AUTO_TEST_CASE(Test_AntigenGene)
{
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t gene_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("MHC CLASS II ANTIGEN gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    for (size_t i = 0; i < gene_count; ++i) {
        gene_subst->SetSynonyms().push_back(kGeneSynonyms[i]);
    }
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK(constraint.Match("MHC CLASS II ANTIGEN gene"));
    BOOST_CHECK(constraint.Match("MHC class II antigen gene"));
}
459 
// Checks the is-all-caps and is-all-lower flags of CString_constraint.
BOOST_AUTO_TEST_CASE(Test_Upper_LowerCases)
{
    CString_constraint constraint;

    // All-caps requirement.
    constraint.SetIs_all_caps(true);

    BOOST_CHECK(!constraint.Match("MHC CLASS ii ANTIGEN gene"));
    BOOST_CHECK(constraint.Match("ANTIGEN"));
    BOOST_CHECK(constraint.Match("ANTIGEN GENE"));
    BOOST_CHECK(constraint.Match("CLASS: ANTIGEN"));

    // All-lower requirement.
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(true);

    BOOST_CHECK(!constraint.Match("MHC CLASS ii ANTIGEN gene"));
    BOOST_CHECK(constraint.Match("antigen"));
    BOOST_CHECK(constraint.Match("antigen gene"));
    BOOST_CHECK(constraint.Match("class: antigen!"));
}
478 
479 
// "NADH dehydrogenase subunit 1 gene" with substitutions for the whole
// phrase, for the digit "1" and for the word "gene".
BOOST_AUTO_TEST_CASE(Test_NADH_dehydrogenase)
{
    static const char* const kPhraseSynonyms[] = {
        "NADH dehydrogenase subunit 1",
        "NADH dehydrogenase 1 gene",
        "NADH dehydrogenase 1",
        "NADH dehydrogenase subunit 1 protein",
        "NADH dehydrogenase 1 protein"
    };
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t phrase_count = sizeof(kPhraseSynonyms) / sizeof(kPhraseSynonyms[0]);
    const size_t gene_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("NADH dehydrogenase subunit 1 gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("NADH dehydrogenase subunit 1 gene");
    for (size_t i = 0; i < phrase_count; ++i) {
        phrase_subst->SetSynonyms().push_back(kPhraseSynonyms[i]);
    }
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    CRef<CWord_substitution> number_subst(new CWord_substitution());
    number_subst->SetWord("1");
    number_subst->SetSynonyms().push_back("one");
    number_subst->SetCase_sensitive(false);
    number_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(number_subst);

    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    for (size_t i = 0; i < gene_count; ++i) {
        gene_subst->SetSynonyms().push_back(kGeneSynonyms[i]);
    }
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK(constraint.Match("NADH dehydrogenase subunit one sequence"));
    BOOST_CHECK(constraint.Match("NADH dehydrogenase subunit 1 gene"));
    BOOST_CHECK(!constraint.Match("NADH dehydrogenase subunit one"));
    BOOST_CHECK(!constraint.Match("NADH dehydrogenase subunit 2 gene"));
    BOOST_CHECK(!constraint.Match("NADH dehydrogenase subunit sequence"));
}
532 
// "beta-actin gene" with substitutions for the whole phrase (hyphen,
// space and underscore spellings) and for the word "gene".
BOOST_AUTO_TEST_CASE(Test_Beta_actinGene)
{
    static const char* const kPhraseSynonyms[] = {
        "beta-actin",
        "beta actin",
        "beta actin gene",
        "beta_actin",
        "beta_actin gene"
    };
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t phrase_count = sizeof(kPhraseSynonyms) / sizeof(kPhraseSynonyms[0]);
    const size_t gene_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("beta-actin gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("beta-actin gene");
    for (size_t i = 0; i < phrase_count; ++i) {
        phrase_subst->SetSynonyms().push_back(kPhraseSynonyms[i]);
    }
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    for (size_t i = 0; i < gene_count; ++i) {
        gene_subst->SetSynonyms().push_back(kGeneSynonyms[i]);
    }
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK(constraint.Match("beta actin"));
    BOOST_CHECK(constraint.Match("beta-actin gene"));
    BOOST_CHECK(constraint.Match("beta_actin sequence"));
}
576 
// Checks the is-first-cap and is-first-each-cap flags of
// CString_constraint.  Lines tagged "!!" were flagged by the original
// author.
BOOST_AUTO_TEST_CASE(Test_FirstCaps)
{
    CString_constraint constraint;

    // ---- is-first-cap ----
    constraint.SetIs_first_cap(true);

    BOOST_CHECK(!constraint.Match(""));
    BOOST_CHECK(!constraint.Match("beta actin"));
    BOOST_CHECK(!constraint.Match("beta Actin"));
    BOOST_CHECK(!constraint.Match("bEta actin"));
    BOOST_CHECK(constraint.Match("BEta actin"));
    BOOST_CHECK(constraint.Match("Beta-actin Gene"));
    BOOST_CHECK(constraint.Match("?Beta_Actin Gene"));
    BOOST_CHECK(constraint.Match("  Beta actin"));
    BOOST_CHECK(!constraint.Match("4"));
    BOOST_CHECK(!constraint.Match("-12Beta"));

    // ---- is-first-each-cap ----
    constraint.SetIs_first_cap(false);
    constraint.SetIs_first_each_cap(true);

    BOOST_CHECK(!constraint.Match(""));
    BOOST_CHECK(!constraint.Match("beta actin"));
    BOOST_CHECK(constraint.Match("Beta Actin"));
    BOOST_CHECK(!constraint.Match("bEta Actin"));
    BOOST_CHECK(constraint.Match(" BEta.Actin"));
    BOOST_CHECK(constraint.Match("Beta-actin Gene"));       // !!
    BOOST_CHECK(constraint.Match("Beta-Actin Gene"));
    BOOST_CHECK(!constraint.Match("Beta_actin Gene"));
    BOOST_CHECK(constraint.Match("-Beta-actin Gene"));
    BOOST_CHECK(constraint.Match("?Beta_Actin Gene"));
    BOOST_CHECK(constraint.Match(" BETA ACTIN"));
    BOOST_CHECK(constraint.Match("12 Ribosomal RNA"));
    BOOST_CHECK(!constraint.Match("12R Ribosomal RNA"));    // !!
    BOOST_CHECK(!constraint.Match("12r Ribosomal RNA"));    // !!
}
611 
// "16S ribosomal RNA gene" with three substitutions: one whose word is
// the empty string (its synonyms are trailing qualifiers such as
// "partial sequence"), one for the "16S" prefix and one for "gene"
// without synonyms.
BOOST_AUTO_TEST_CASE(Test_Matching_OptionalString)
{
    static const char* const kOptionalSynonyms[] = {
        "partial sequence",
        "complete sequence",
        "partial",
        "complete",
        "gene",
        "region"
    };
    static const char* const kSubunitSynonyms[] = {
        "5.8S",
        "12S",
        "18S",
        "23S",
        "28S"
    };
    const size_t optional_count = sizeof(kOptionalSynonyms) / sizeof(kOptionalSynonyms[0]);
    const size_t subunit_count = sizeof(kSubunitSynonyms) / sizeof(kSubunitSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("16S ribosomal RNA gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // Substitution whose word is empty.
    CRef<CWord_substitution> optional_subst(new CWord_substitution());
    optional_subst->SetWord("");
    for (size_t i = 0; i < optional_count; ++i) {
        optional_subst->SetSynonyms().push_back(kOptionalSynonyms[i]);
    }
    optional_subst->SetCase_sensitive(false);
    optional_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(optional_subst);

    CRef<CWord_substitution> subunit_subst(new CWord_substitution());
    subunit_subst->SetWord("16S");
    for (size_t i = 0; i < subunit_count; ++i) {
        subunit_subst->SetSynonyms().push_back(kSubunitSynonyms[i]);
    }
    subunit_subst->SetCase_sensitive(false);
    subunit_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(subunit_subst);

    // "gene" with an empty synonym list.
    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK(constraint.Match("18S ribosomal RNA gene"));
    BOOST_CHECK(constraint.Match("18S ribosomal RNA gene, partial sequence"));
}
662 
// The exact match text must match both before and after a substitution
// for the whole phrase has been registered.
BOOST_AUTO_TEST_CASE(Test_Matching_COI)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetMatch_text("cytochrome oxidase subunit I (COI)");
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_punct(true);
    constraint.SetIgnore_space(true);

    // Without any ignore-words.
    BOOST_CHECK(constraint.Match("cytochrome oxidase subunit I (COI)"));

    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("cytochrome oxidase subunit I (COI)");
    phrase_subst->SetSynonyms().push_back("cytochrome oxidase subunit I");
    phrase_subst->SetWhole_word(false);
    phrase_subst->SetCase_sensitive(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    // With the substitution in place.
    BOOST_CHECK(constraint.Match("cytochrome oxidase subunit I (COI)"));
}
684 
// A single-space match text (contains, spaces and punctuation not
// ignored) must not match a string that has no space in it.
BOOST_AUTO_TEST_CASE(Test_Matching_Space)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_contains);
    constraint.SetMatch_text(" ");
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_punct(false);
    constraint.SetIgnore_space(false);

    BOOST_CHECK(!constraint.Match("Bacillus"));
}
696 
// Case-sensitive "ends" match on "aceae" with a substitution
// ("aceae" / "ales") registered.
BOOST_AUTO_TEST_CASE(Test_MatchEnd)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_ends);
    constraint.SetMatch_text("aceae");
    constraint.SetCase_sensitive(true);

    CRef<CWord_substitution> suffix_subst(new CWord_substitution());
    suffix_subst->SetWord("aceae");
    suffix_subst->SetSynonyms().push_back("ales");
    constraint.SetIgnore_words().Set().push_back(suffix_subst);

    // "aceae" occurs, but not at the end of the string.
    BOOST_CHECK(!constraint.Match("Methylophilaceae bacterium"));

    BOOST_CHECK(constraint.Match("Methylophilaceae"));
    BOOST_CHECK(!constraint.Match("bacterium"));
}
713 
// Two unrelated product names must not be reported as equal when case,
// spaces and punctuation are ignored.
BOOST_AUTO_TEST_CASE(Test_UnwantedMatch1)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetMatch_text("RNA-Dependent RNA polymerase");
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_punct(true);
    constraint.SetIgnore_space(true);

    BOOST_CHECK(!constraint.Match("NADH dehydrogenase subunit"));
}
725 
// Second unwanted-match check: "Nonstructural protein" must not equal
// "reverse transcriptase" when case, spaces and punctuation are ignored.
BOOST_AUTO_TEST_CASE(Test_UnwantedMatch2)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetMatch_text("Nonstructural protein");
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_punct(true);
    constraint.SetIgnore_space(true);

    BOOST_CHECK(!constraint.Match("reverse transcriptase"));
}
737