1 /* $Id: unit_test_string_constraint.cpp 571441 2018-09-26 16:07:20Z kachalos $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Simple unit test for CString_constraint.
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35
36 #include <objects/macro/String_constraint.hpp>
37 #include <objects/macro/String_location.hpp>
38 #include <objects/macro/Word_substitution.hpp>
39 #include <objects/macro/Word_substitution_set.hpp>
40 #include <objects/macro/Suspect_rule.hpp>
41 #include <objects/macro/Replace_rule.hpp>
42 #include <objects/macro/Replace_func.hpp>
43 #include <objects/macro/Simple_replace.hpp>
44 #include <objects/macro/Search_func.hpp>
45
46 #include <corelib/ncbiapp.hpp>
47 #include <corelib/test_boost.hpp>
48
49 #include <util/util_misc.hpp>
50
51 #include <common/test_assert.h> /* This header must go last */
52
53 USING_NCBI_SCOPE;
54 USING_SCOPE(objects);
55
NCBITEST_AUTO_INIT()56 NCBITEST_AUTO_INIT()
57 {
58 }
59
// Checks CWord_substitution::GetMatchLens for the word "fruit" with several
// synonyms, one of which ("fruit, canned") is the full candidate text.
// Exactly one match length is expected, and it must equal 13, the length of
// "fruit, canned".
BOOST_AUTO_TEST_CASE(Test_WordSubstitution)
{
    CWord_substitution word;

    word.SetWord("fruit");
    word.SetSynonyms().push_back("apple");
    word.SetSynonyms().push_back("orange");
    word.SetSynonyms().push_back("pear");
    word.SetSynonyms().push_back("grapefruit");
    word.SetSynonyms().push_back("fruit, canned");

    const vector<size_t> match_lens = word.GetMatchLens("fruit, canned", "fruit", 0);

    // REQUIRE rather than CHECK: a wrong size aborts the test case here, so
    // the element access below can never index past the end of the vector.
    // Unsigned literals avoid a signed/unsigned comparison against size_t.
    BOOST_REQUIRE_EQUAL(match_lens.size(), 1u);
    BOOST_CHECK_EQUAL(match_lens[0], 13u);
}
75
76
// Exercises CString_constraint::Match for each supported match location and
// for the ignore-punctuation, ignore-space and whole-word options, then
// checks CString_constraint::ReplaceStringConstraintPortionInString.
BOOST_AUTO_TEST_CASE(Test_SimpleConstraints)
{
    CString_constraint s;

    s.SetMatch_text("cat");

    // Location: contains
    s.SetMatch_location(eString_location_contains);
    BOOST_CHECK_EQUAL(s.Match("cat"), true);
    BOOST_CHECK_EQUAL(s.Match("catalog"), true);
    BOOST_CHECK_EQUAL(s.Match("the catalog"), true);
    BOOST_CHECK_EQUAL(s.Match("ducat"), true);
    BOOST_CHECK_EQUAL(s.Match("dog"), false);
    BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), true);

    // Location: equals
    s.SetMatch_location(eString_location_equals);
    BOOST_CHECK_EQUAL(s.Match("cat"), true);
    BOOST_CHECK_EQUAL(s.Match("catalog"), false);
    BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
    BOOST_CHECK_EQUAL(s.Match("ducat"), false);
    BOOST_CHECK_EQUAL(s.Match("dog"), false);
    BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), false);

    // Location: starts
    s.SetMatch_location(eString_location_starts);
    BOOST_CHECK_EQUAL(s.Match("cat"), true);
    BOOST_CHECK_EQUAL(s.Match("catalog"), true);
    BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
    BOOST_CHECK_EQUAL(s.Match("ducat"), false);
    BOOST_CHECK_EQUAL(s.Match("dog"), false);
    BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), false);

    // Location: ends
    s.SetMatch_location(eString_location_ends);
    BOOST_CHECK_EQUAL(s.Match("cat"), true);
    BOOST_CHECK_EQUAL(s.Match("catalog"), false);
    BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
    BOOST_CHECK_EQUAL(s.Match("ducat"), true);
    BOOST_CHECK_EQUAL(s.Match("dog"), false);
    BOOST_CHECK_EQUAL(s.Match("dog, cat, cow"), false);

    // eString_location_inlist is no longer supported, so it is not
    // exercised here.

    // Ignore punctuation only.
    s.SetMatch_text("dog, cat, cow");
    s.SetMatch_location(eString_location_contains);
    s.SetIgnore_punct(true);
    BOOST_CHECK_EQUAL(s.Match("dog cat cow"), true);
    // NOTE: the input below has TWO spaces after "dog". It used to be
    // identical to the literal above while expecting the opposite result,
    // which could never pass.
    // NOTE(review): confirm the double space against the upstream test.
    BOOST_CHECK_EQUAL(s.Match("dog  cat cow"), false);
    BOOST_CHECK_EQUAL(s.Match("dogcatcow"), false);
    BOOST_CHECK_EQUAL(s.Match("dog.cat.cow"), false);
    BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), false);

    // Ignore punctuation and spaces (second input is double-spaced).
    s.SetIgnore_space(true);
    BOOST_CHECK_EQUAL(s.Match("dog cat cow"), true);
    BOOST_CHECK_EQUAL(s.Match("dog  cat cow"), true);
    BOOST_CHECK_EQUAL(s.Match("dogcatcow"), true);
    BOOST_CHECK_EQUAL(s.Match("dog.cat.cow"), true);
    BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), true);

    // Ignore spaces only (second input is double-spaced).
    s.ResetIgnore_punct();
    BOOST_CHECK_EQUAL(s.Match("dog cat cow"), false);
    BOOST_CHECK_EQUAL(s.Match("dog  cat cow"), false);
    BOOST_CHECK_EQUAL(s.Match("dogcatcow"), false);
    BOOST_CHECK_EQUAL(s.Match("dog.cat.cow"), false);
    BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), true);

    // Whole-word matching.
    s.Reset();
    s.SetMatch_text("cat");
    s.SetWhole_word(true);
    s.SetMatch_location(eString_location_contains);
    BOOST_CHECK_EQUAL(s.Match("cat"), true);
    BOOST_CHECK_EQUAL(s.Match("catalog"), false);
    BOOST_CHECK_EQUAL(s.Match("the catalog"), false);
    BOOST_CHECK_EQUAL(s.Match("ducat"), false);
    BOOST_CHECK_EQUAL(s.Match("dog"), false);
    BOOST_CHECK_EQUAL(s.Match("dog,cat,cow"), true);

    // Replacement of the matched portion. The call is expected to return
    // true and write the edited text to 'out' when the constraint matches.
    string in, out;
    s.Reset();
    s.SetMatch_text("cat");
    s.SetMatch_location(eString_location_contains);

    in = "cat";
    BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
    BOOST_CHECK_EQUAL(out, "dog");

    in = "catalog";
    BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
    BOOST_CHECK_EQUAL(out, "dogalog");

    in = "the catalog";
    BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
    BOOST_CHECK_EQUAL(out, "the dogalog");

    in = "ducat";
    BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
    BOOST_CHECK_EQUAL(out, "dudog");

    in = "dog, cat, cow";
    BOOST_CHECK(s.ReplaceStringConstraintPortionInString(out, in, "dog"));
    BOOST_CHECK_EQUAL(out, "dog, dog, cow");

    // No match: the call is expected to return false and 'out' to hold the
    // unmodified input.
    in = "feline";
    BOOST_CHECK(!s.ReplaceStringConstraintPortionInString(out, in, "dog"));
    BOOST_CHECK_EQUAL(out, "feline");
}
197
198
// Matches a fixed sentence against constraints whose ignore-words list
// supplies word substitutions, then checks that tightening a substitution
// (whole-word or case-sensitive) makes the match fail.
BOOST_AUTO_TEST_CASE(Test_StringConstraintWithSynonyms)
{
    const string sentence = "The quick brown fox jumped over the lazy dog.";

    CString_constraint constraint;

    // Part 1: "contains" match with two substitutions.
    constraint.SetMatch_location(eString_location_contains);
    constraint.SetMatch_text("dog leaped");
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("leap", "jump")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("dog", "fox")));

    BOOST_CHECK_EQUAL(constraint.Match(sentence), true);

    // Part 2: "equals" match in which every word is substituted.
    constraint.Reset();
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetMatch_text("A fast beige wolf leaped across a sleepy beagle.");

    // These two substitutions are modified later, so keep handles to them.
    CRef<CWord_substitution> article(new CWord_substitution("a", "the"));
    CRef<CWord_substitution> hop(new CWord_substitution("leap", "jump"));

    constraint.SetIgnore_words().Set().push_back(article);
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("fast", "quick")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("beige", "brown")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("wolf", "fox")));
    constraint.SetIgnore_words().Set().push_back(hop);
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("across", "over")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("sleepy", "lazy")));
    constraint.SetIgnore_words().Set().push_back(
        CRef<CWord_substitution>(new CWord_substitution("beagle", "dog")));

    BOOST_CHECK_EQUAL(constraint.Match(sentence), true);

    // The match fails once "leap" must be a whole word.
    hop->SetWhole_word(true);
    BOOST_CHECK_EQUAL(constraint.Match(sentence), false);

    // The match also fails once the article substitution is case sensitive.
    hop->SetWhole_word(false);
    article->SetCase_sensitive(true);
    BOOST_CHECK_EQUAL(constraint.Match(sentence), false);
}
245
// "equals" constraint on "Homo sapiens" (ignoring spaces and punctuation)
// with two ignore-word substitutions: one for the whole phrase and one for
// the word "sapiens".
BOOST_AUTO_TEST_CASE(Test_synonyms)
{
    static const char* const kPhraseSynonyms[] = {
        "human",
        "Homo sapien",
        "Homosapiens",
        "Homo-sapiens",
        "Homo spiens",
        "Homo Sapience",
        "homosapein",
        "homosapiens",
        "homosapien",
        "homo_sapien",
        "homo_sapiens",
        "Homosipian"
    };
    static const char* const kWordSynonyms[] = {
        "sapien",
        "sapeins",
        "sapein",
        "sapins",
        "sapens",
        "sapin",
        "sapen",
        "sapians",
        "sapian",
        "sapies",
        "sapie"
    };
    const size_t phrase_count = sizeof(kPhraseSynonyms) / sizeof(kPhraseSynonyms[0]);
    const size_t word_count = sizeof(kWordSynonyms) / sizeof(kWordSynonyms[0]);

    // string_constraint with ignore-words
    CString_constraint constraint;
    constraint.SetMatch_text("Homo sapiens");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    CRef<CWord_substitution> phrase_sub(new CWord_substitution);
    phrase_sub->SetWord("Homo sapiens");
    phrase_sub->SetSynonyms() =
        list<string>(kPhraseSynonyms, kPhraseSynonyms + phrase_count);
    constraint.SetIgnore_words().Set().push_back(phrase_sub);

    CRef<CWord_substitution> word_sub(new CWord_substitution);
    word_sub->SetWord("sapiens");
    word_sub->SetSynonyms() =
        list<string>(kWordSynonyms, kWordSynonyms + word_count);
    constraint.SetIgnore_words().Set().push_back(word_sub);

    BOOST_CHECK_EQUAL(constraint.Match("human"), true);
    BOOST_CHECK_EQUAL(constraint.Match("humano"), false);
    BOOST_CHECK_EQUAL(constraint.Match("Homo sapien"), true);
    BOOST_CHECK_EQUAL(constraint.Match("Human sapien"), false);
    BOOST_CHECK_EQUAL(constraint.Match("sapien"), false);
}
300
301
// "equals" constraint on "cytochrome b gene" with substitutions for the
// whole phrase and for the word "gene".
// NOTE(review): the name presumably refers to tracker ticket SQD-2048.
BOOST_AUTO_TEST_CASE(Test_SQD_2048)
{
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t gene_syn_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("cytochrome b gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // Substitution for the full phrase.
    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("cytochrome b gene");
    list<string>& phrase_syns = phrase_subst->SetSynonyms();
    phrase_syns.push_back("cytochrome b cytb");
    phrase_syns.push_back("cytochrome b cyt b");
    phrase_syns.push_back("cytochrome b (cytb)");
    phrase_syns.push_back("cytochrome b (cyt b)");
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    // Substitution for the trailing word "gene".
    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    list<string>& gene_syns = gene_subst->SetSynonyms();
    gene_syns.insert(gene_syns.end(), kGeneSynonyms, kGeneSynonyms + gene_syn_count);
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    // Remaining flags are explicitly set to false.
    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    //NcbiCout << MSerial_AsnText << constraint;

    BOOST_CHECK_EQUAL(constraint.Match("cytochrome b gene"), true);
    BOOST_CHECK_EQUAL(constraint.Match("cytochrome b partial"), true);
    BOOST_CHECK_EQUAL(constraint.Match("cytb"), false);
}
347
348
// Builds a suspect rule that finds "localisation" and replaces it with
// "localization", then applies it to a string that begins with a
// capitalized "Localisation".
// NOTE(review): the name presumably refers to tracker ticket SQD-2093.
BOOST_AUTO_TEST_CASE(Test_SQD_2093)
{
    CSuspect_rule rule;

    // Find part: string constraint, "contains" match.
    CString_constraint& find_constraint = rule.SetFind().SetString_constraint();
    find_constraint.SetMatch_text("localisation");
    find_constraint.SetMatch_location(eString_location_contains);

    // Replace part: simple replacement of the matched portion.
    CSimple_replace& simple_replace =
        rule.SetReplace().SetReplace_func().SetSimple_replace();
    simple_replace.SetReplace("localization");
    simple_replace.SetWhole_string(false);
    simple_replace.SetWeasel_to_putative(false);
    rule.SetReplace().SetMove_to_note(false);

    string text = "Localisation of periplasmic protein complexes";
    BOOST_CHECK_EQUAL(rule.GetFind().Match(text), true);
    BOOST_CHECK_EQUAL(rule.ApplyToString(text), true);
    // The replacement text is inserted as given, i.e. in lower case.
    BOOST_CHECK_EQUAL(text, "localization of periplasmic protein complexes");
}
366
367
// "equals" constraint on "cytochrome oxidase subunit I gene" with four
// substitutions: the full phrase, "gene" without synonyms, "gene" with
// synonyms, and "oxidase" -> "oxydase".
BOOST_AUTO_TEST_CASE(Test_CytochromeOxidase)
{
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t gene_syn_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("cytochrome oxidase subunit I gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // 1) Full phrase.
    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("cytochrome oxidase subunit I gene");
    list<string>& phrase_syns = phrase_subst->SetSynonyms();
    phrase_syns.push_back("cytochrome oxidase I gene");
    phrase_syns.push_back("cytochrome oxidase I");
    phrase_syns.push_back("cytochrome subunit I");
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    // 2) "gene" with no synonyms at all.
    CRef<CWord_substitution> bare_gene_subst(new CWord_substitution());
    bare_gene_subst->SetWord("gene");
    bare_gene_subst->SetCase_sensitive(false);
    bare_gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(bare_gene_subst);

    // 3) "gene" with synonyms.
    /* Instead of having the synonym-less substitution above, we can add the
     * line below to this one, the effect is the same:
     *   gene_subst->SetSynonyms().push_back(kEmptyStr);
     */
    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    list<string>& gene_syns = gene_subst->SetSynonyms();
    gene_syns.insert(gene_syns.end(), kGeneSynonyms, kGeneSynonyms + gene_syn_count);
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    // 4) Spelling variant of "oxidase".
    CRef<CWord_substitution> oxidase_subst(new CWord_substitution());
    oxidase_subst->SetWord("oxidase");
    oxidase_subst->SetSynonyms().push_back("oxydase");
    oxidase_subst->SetCase_sensitive(false);
    oxidase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(oxidase_subst);

    // Remaining flags are explicitly set to false.
    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK_EQUAL(constraint.Match("cytochrome oxidase subunit I"), true);
    BOOST_CHECK_EQUAL(constraint.Match("cytochrome oxydase subunit I"), true);
    BOOST_CHECK_EQUAL(constraint.Match("cytochrome oxydase subunit I gene"), true);
}
426
// Case-insensitive "equals" constraint on "MHC CLASS II ANTIGEN gene" with a
// substitution for the word "gene"; both the upper-case and the mixed-case
// spelling are expected to match.
BOOST_AUTO_TEST_CASE(Test_AntigenGene)
{
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t gene_syn_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("MHC CLASS II ANTIGEN gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    list<string>& gene_syns = gene_subst->SetSynonyms();
    gene_syns.insert(gene_syns.end(), kGeneSynonyms, kGeneSynonyms + gene_syn_count);
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    // Remaining flags are explicitly set to false.
    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK_EQUAL(constraint.Match("MHC CLASS II ANTIGEN gene"), true);
    BOOST_CHECK_EQUAL(constraint.Match("MHC class II antigen gene"), true);
}
459
// Checks the is-all-caps and is-all-lower flags of CString_constraint.
// The expectations show that spaces and punctuation do not prevent a match,
// while a single letter of the wrong case does.
BOOST_AUTO_TEST_CASE(Test_Upper_LowerCases)
{
    CString_constraint constraint;

    // All upper case.
    constraint.SetIs_all_caps(true);

    BOOST_CHECK(!constraint.Match("MHC CLASS ii ANTIGEN gene"));
    BOOST_CHECK(constraint.Match("ANTIGEN"));
    BOOST_CHECK(constraint.Match("ANTIGEN GENE"));
    BOOST_CHECK(constraint.Match("CLASS: ANTIGEN"));

    // All lower case.
    constraint.SetIs_all_lower(true);
    constraint.SetIs_all_caps(false);

    BOOST_CHECK(!constraint.Match("MHC CLASS ii ANTIGEN gene"));
    BOOST_CHECK(constraint.Match("antigen"));
    BOOST_CHECK(constraint.Match("antigen gene"));
    BOOST_CHECK(constraint.Match("class: antigen!"));
}
478
479
// "equals" constraint on "NADH dehydrogenase subunit 1 gene" with
// substitutions for the full phrase, for "1" -> "one", and for "gene".
BOOST_AUTO_TEST_CASE(Test_NADH_dehydrogenase)
{
    static const char* const kPhraseSynonyms[] = {
        "NADH dehydrogenase subunit 1",
        "NADH dehydrogenase 1 gene",
        "NADH dehydrogenase 1",
        "NADH dehydrogenase subunit 1 protein",
        "NADH dehydrogenase 1 protein"
    };
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t phrase_syn_count = sizeof(kPhraseSynonyms) / sizeof(kPhraseSynonyms[0]);
    const size_t gene_syn_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("NADH dehydrogenase subunit 1 gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // Full phrase.
    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("NADH dehydrogenase subunit 1 gene");
    list<string>& phrase_syns = phrase_subst->SetSynonyms();
    phrase_syns.insert(phrase_syns.end(), kPhraseSynonyms, kPhraseSynonyms + phrase_syn_count);
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    // Digit versus spelled-out number.
    CRef<CWord_substitution> number_subst(new CWord_substitution());
    number_subst->SetWord("1");
    number_subst->SetSynonyms().push_back("one");
    number_subst->SetCase_sensitive(false);
    number_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(number_subst);

    // Trailing word "gene".
    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    list<string>& gene_syns = gene_subst->SetSynonyms();
    gene_syns.insert(gene_syns.end(), kGeneSynonyms, kGeneSynonyms + gene_syn_count);
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    // Remaining flags are explicitly set to false.
    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK_EQUAL(constraint.Match("NADH dehydrogenase subunit one sequence"), true);
    BOOST_CHECK_EQUAL(constraint.Match("NADH dehydrogenase subunit 1 gene"), true);
    BOOST_CHECK_EQUAL(constraint.Match("NADH dehydrogenase subunit one"), false);
    BOOST_CHECK_EQUAL(constraint.Match("NADH dehydrogenase subunit 2 gene"), false);
    BOOST_CHECK_EQUAL(constraint.Match("NADH dehydrogenase subunit sequence"), false);
}
532
// "equals" constraint on "beta-actin gene" with substitutions for the full
// phrase (hyphen / space / underscore variants) and for the word "gene".
BOOST_AUTO_TEST_CASE(Test_Beta_actinGene)
{
    static const char* const kPhraseSynonyms[] = {
        "beta-actin",
        "beta actin",
        "beta actin gene",
        "beta_actin",
        "beta_actin gene"
    };
    static const char* const kGeneSynonyms[] = {
        "sequence",
        "partial",
        "complete",
        "region",
        "partial sequence",
        "complete sequence"
    };
    const size_t phrase_syn_count = sizeof(kPhraseSynonyms) / sizeof(kPhraseSynonyms[0]);
    const size_t gene_syn_count = sizeof(kGeneSynonyms) / sizeof(kGeneSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("beta-actin gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // Full phrase.
    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("beta-actin gene");
    list<string>& phrase_syns = phrase_subst->SetSynonyms();
    phrase_syns.insert(phrase_syns.end(), kPhraseSynonyms, kPhraseSynonyms + phrase_syn_count);
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    // Trailing word "gene".
    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    list<string>& gene_syns = gene_subst->SetSynonyms();
    gene_syns.insert(gene_syns.end(), kGeneSynonyms, kGeneSynonyms + gene_syn_count);
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    // Remaining flags are explicitly set to false.
    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK_EQUAL(constraint.Match("beta actin"), true);
    BOOST_CHECK_EQUAL(constraint.Match("beta-actin gene"), true);
    BOOST_CHECK_EQUAL(constraint.Match("beta_actin sequence"), true);
}
576
// Checks the is-first-cap flag and then the is-first-each-cap flag of
// CString_constraint against a list of sample strings.
BOOST_AUTO_TEST_CASE(Test_FirstCaps)
{
    CString_constraint constraint;

    // First letter capitalized.
    constraint.SetIs_first_cap(true);

    BOOST_CHECK(!constraint.Match(""));
    BOOST_CHECK(!constraint.Match("beta actin"));
    BOOST_CHECK(!constraint.Match("beta Actin"));
    BOOST_CHECK(!constraint.Match("bEta actin"));
    BOOST_CHECK(constraint.Match("BEta actin"));
    BOOST_CHECK(constraint.Match("Beta-actin Gene"));
    BOOST_CHECK(constraint.Match("?Beta_Actin Gene"));
    BOOST_CHECK(constraint.Match(" Beta actin"));
    BOOST_CHECK(!constraint.Match("4"));
    BOOST_CHECK(!constraint.Match("-12Beta"));

    // First letter of each word capitalized.
    constraint.SetIs_first_each_cap(true);
    constraint.SetIs_first_cap(false);

    // Lines tagged "//!!" were flagged that way in the original test.
    BOOST_CHECK(!constraint.Match(""));
    BOOST_CHECK(!constraint.Match("beta actin"));
    BOOST_CHECK(constraint.Match("Beta Actin"));
    BOOST_CHECK(!constraint.Match("bEta Actin"));
    BOOST_CHECK(constraint.Match(" BEta.Actin"));
    BOOST_CHECK(constraint.Match("Beta-actin Gene")); //!!
    BOOST_CHECK(constraint.Match("Beta-Actin Gene"));
    BOOST_CHECK(!constraint.Match("Beta_actin Gene"));
    BOOST_CHECK(constraint.Match("-Beta-actin Gene"));
    BOOST_CHECK(constraint.Match("?Beta_Actin Gene"));
    BOOST_CHECK(constraint.Match(" BETA ACTIN"));
    BOOST_CHECK(constraint.Match("12 Ribosomal RNA"));
    BOOST_CHECK(!constraint.Match("12R Ribosomal RNA")); //!!
    BOOST_CHECK(!constraint.Match("12r Ribosomal RNA")); //!!
}
611
// "equals" constraint on "16S ribosomal RNA gene" where a substitution with
// an empty word lists optional trailing strings, a second one lists other
// rRNA sizes for "16S", and a third one is "gene" without synonyms.
BOOST_AUTO_TEST_CASE(Test_Matching_OptionalString)
{
    static const char* const kOptionalStrings[] = {
        "partial sequence",
        "complete sequence",
        "partial",
        "complete",
        "gene",
        "region"
    };
    static const char* const kSizeSynonyms[] = {
        "5.8S",
        "12S",
        "18S",
        "23S",
        "28S"
    };
    const size_t optional_count = sizeof(kOptionalStrings) / sizeof(kOptionalStrings[0]);
    const size_t size_count = sizeof(kSizeSynonyms) / sizeof(kSizeSynonyms[0]);

    CString_constraint constraint;
    constraint.SetMatch_text("16S ribosomal RNA gene");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // Empty word: its synonyms are the optional strings.
    CRef<CWord_substitution> optional_subst(new CWord_substitution());
    optional_subst->SetWord("");
    list<string>& optional_syns = optional_subst->SetSynonyms();
    optional_syns.insert(optional_syns.end(), kOptionalStrings, kOptionalStrings + optional_count);
    optional_subst->SetCase_sensitive(false);
    optional_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(optional_subst);

    // Alternative rRNA sizes for "16S".
    CRef<CWord_substitution> size_subst(new CWord_substitution());
    size_subst->SetWord("16S");
    list<string>& size_syns = size_subst->SetSynonyms();
    size_syns.insert(size_syns.end(), kSizeSynonyms, kSizeSynonyms + size_count);
    size_subst->SetCase_sensitive(false);
    size_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(size_subst);

    // "gene" with no synonyms.
    CRef<CWord_substitution> gene_subst(new CWord_substitution());
    gene_subst->SetWord("gene");
    gene_subst->SetCase_sensitive(false);
    gene_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(gene_subst);

    // Remaining flags are explicitly set to false.
    constraint.SetWhole_word(false);
    constraint.SetNot_present(false);
    constraint.SetIs_all_caps(false);
    constraint.SetIs_all_lower(false);
    constraint.SetIs_all_punct(false);
    constraint.SetIgnore_weasel(false);

    BOOST_CHECK_EQUAL(constraint.Match("18S ribosomal RNA gene"), true);
    BOOST_CHECK_EQUAL(constraint.Match("18S ribosomal RNA gene, partial sequence"), true);
}
662
// Checks that a text equal to the match text still matches after a
// substitution for the whole phrase (with a shorter synonym) is added.
BOOST_AUTO_TEST_CASE(Test_Matching_COI)
{
    const char* const kPhrase = "cytochrome oxidase subunit I (COI)";

    CString_constraint constraint;
    constraint.SetMatch_text("cytochrome oxidase subunit I (COI)");
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetCase_sensitive(false);
    constraint.SetIgnore_space(true);
    constraint.SetIgnore_punct(true);

    // Without any ignore-words.
    BOOST_CHECK(constraint.Match(kPhrase));

    CRef<CWord_substitution> phrase_subst(new CWord_substitution());
    phrase_subst->SetWord("cytochrome oxidase subunit I (COI)");
    phrase_subst->SetSynonyms().push_back("cytochrome oxidase subunit I");
    phrase_subst->SetCase_sensitive(false);
    phrase_subst->SetWhole_word(false);
    constraint.SetIgnore_words().Set().push_back(phrase_subst);

    // With the phrase substitution in place.
    BOOST_CHECK(constraint.Match(kPhrase));
}
684
// A "contains" constraint whose match text is a space must not match a
// string that has no space in it.
BOOST_AUTO_TEST_CASE(Test_Matching_Space)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_contains);
    constraint.SetMatch_text(" ");
    constraint.SetIgnore_punct(false);
    constraint.SetIgnore_space(false);
    constraint.SetCase_sensitive(false);

    BOOST_CHECK(!constraint.Match("Bacillus"));
}
696
// Case-sensitive "ends" constraint on "aceae" with a substitution
// "aceae" -> "ales": only a string that ends with "aceae" is expected to
// match among the samples below.
BOOST_AUTO_TEST_CASE(Test_MatchEnd)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_ends);
    constraint.SetMatch_text("aceae");
    constraint.SetCase_sensitive(true);

    CRef<CWord_substitution> suffix_subst(new CWord_substitution());
    suffix_subst->SetWord("aceae");
    suffix_subst->SetSynonyms().push_back("ales");
    constraint.SetIgnore_words().Set().push_back(suffix_subst);

    // "aceae" occurs, but not at the end.
    BOOST_CHECK(!constraint.Match("Methylophilaceae bacterium"));

    BOOST_CHECK(constraint.Match("Methylophilaceae"));
    BOOST_CHECK(!constraint.Match("bacterium"));
}
713
// An "equals" constraint on "RNA-Dependent RNA polymerase" (ignoring case,
// spaces and punctuation) must not match an unrelated protein name.
BOOST_AUTO_TEST_CASE(Test_UnwantedMatch1)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetMatch_text("RNA-Dependent RNA polymerase");
    constraint.SetIgnore_punct(true);
    constraint.SetIgnore_space(true);
    constraint.SetCase_sensitive(false);

    BOOST_CHECK(!constraint.Match("NADH dehydrogenase subunit"));
}
725
// An "equals" constraint on "Nonstructural protein" (ignoring case, spaces
// and punctuation) must not match an unrelated protein name.
BOOST_AUTO_TEST_CASE(Test_UnwantedMatch2)
{
    CString_constraint constraint;
    constraint.SetMatch_location(eString_location_equals);
    constraint.SetMatch_text("Nonstructural protein");
    constraint.SetIgnore_punct(true);
    constraint.SetIgnore_space(true);
    constraint.SetCase_sensitive(false);

    BOOST_CHECK(!constraint.Match("reverse transcriptase"));
}
737