1 /* $Id: unit_test_internal_stops.cpp 631487 2021-05-19 13:44:58Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mike DiCuccio
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32
33 #include <ncbi_pch.hpp>
34
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/test_boost.hpp>
39
40 #include <objmgr/object_manager.hpp>
41 #include <objtools/data_loaders/genbank/gbloader.hpp>
42 #include <objmgr/scope.hpp>
43
44 #include <serial/serial.hpp>
45 #include <serial/objistr.hpp>
46 #include <serial/objostr.hpp>
47
48 #include <algo/sequence/internal_stops.hpp>
49
50 USING_NCBI_SCOPE;
51 USING_SCOPE(objects);
52
NCBITEST_INIT_CMDLINE(arg_desc)53 NCBITEST_INIT_CMDLINE(arg_desc)
54 {
55 // Here we make descriptions of command line parameters that we are
56 // going to use.
57
58 }
59
NCBITEST_AUTO_INIT()60 NCBITEST_AUTO_INIT()
61 {
62 CRef<CObjectManager> om = CObjectManager::GetInstance();
63 CGBDataLoader::RegisterInObjectManager(*om);
64 }
65
BOOST_AUTO_TEST_CASE(TestProtein)66 BOOST_AUTO_TEST_CASE(TestProtein)
67 {
68 CRef<CObjectManager> om = CObjectManager::GetInstance();
69 CScope scope(*om);
70 scope.AddDefaults();
71
72 string buf = " \
73 Seq-align ::= { \
74 type disc, \
75 dim 2, \
76 segs spliced { \
77 product-id gi 148225248, \
78 genomic-id gi 224514980, \
79 genomic-strand plus, \
80 product-type protein, \
81 exons { \
82 { \
83 product-start protpos { \
84 amin 22, \
85 frame 1 \
86 }, \
87 product-end protpos { \
88 amin 277, \
89 frame 3 \
90 }, \
91 genomic-start 30641728, \
92 genomic-end 30642468, \
93 parts { \
94 diag 69, \
95 product-ins 3, \
96 diag 30, \
97 product-ins 4, \
98 diag 494, \
99 product-ins 2, \
100 diag 85, \
101 product-ins 18, \
102 diag 63 \
103 }, \
104 partial TRUE \
105 } \
106 }, \
107 product-length 278, \
108 modifiers { \
109 stop-codon-found TRUE \
110 } \
111 } \
112 } \
113 ";
114
115 CNcbiIstrstream istrs(buf);
116
117 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
118
119 CSeq_align align;
120 *istr >> align;
121
122 BOOST_CHECK_NO_THROW(align.Validate(true));
123
124 CInternalStopFinder int_stop_finder(scope);
125
126 set<TSeqPos> stops = int_stop_finder.FindStops(align);
127
128 BOOST_CHECK_EQUAL( stops.size(), 2U );
129 }
130
131 /*
132 BOOST_AUTO_TEST_CASE(TestMRNA)
133 {
134 CRef<CObjectManager> om = CObjectManager::GetInstance();
135 CScope scope(*om);
136 scope.AddDefaults();
137
138 string buf = " \
139 Seq-align ::= { \
140 type disc, \
141 dim 2, \
142 segs spliced { \
143 product-id gi 178405, \
144 genomic-id gi 224514917, \
145 product-strand plus, \
146 genomic-strand minus, \
147 product-type transcript, \
148 exons { \
149 { \
150 product-start nucpos 0, \
151 product-end nucpos 2083, \
152 genomic-start 78159147, \
153 genomic-end 78161233, \
154 parts { \
155 match 159, \
156 genomic-ins 1, \
157 match 25, \
158 genomic-ins 1, \
159 match 47, \
160 product-ins 1, \
161 match 361, \
162 genomic-ins 2, \
163 match 1491 \
164 } \
165 } \
166 }, \
167 product-length 2084 \
168 } \
169 } \
170 ";
171
172 CNcbiIstrstream istrs(buf.c_str());
173
174 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
175
176 CSeq_align align;
177 *istr >> align;
178
179 BOOST_CHECK_NO_THROW(align.Validate(true));
180
181 CInternalStopFinder int_stop_finder(scope);
182
183 set<TSeqPos> stops = int_stop_finder.FindStops(align);
184
185 BOOST_CHECK_EQUAL( stops.size(), 9U );
186 }
187 */
188
189
BOOST_AUTO_TEST_CASE(TestStartStopProteinWithPadding)190 BOOST_AUTO_TEST_CASE(TestStartStopProteinWithPadding)
191 {
192 CRef<CObjectManager> om = CObjectManager::GetInstance();
193 CScope scope(*om);
194 scope.AddDefaults();
195
196 string buf = " \
197 Seq-align ::= { \
198 type disc, \
199 dim 2, \
200 segs spliced { \
201 product-id gi 148225248, \
202 genomic-id gi 224514980, \
203 genomic-strand plus, \
204 product-type protein, \
205 exons { \
206 { \
207 product-start protpos { \
208 amin 22, \
209 frame 1 \
210 }, \
211 product-end protpos { \
212 amin 277, \
213 frame 3 \
214 }, \
215 genomic-start 30641728, \
216 genomic-end 30642468, \
217 parts { \
218 diag 69, \
219 product-ins 3, \
220 diag 30, \
221 product-ins 4, \
222 diag 494, \
223 product-ins 2, \
224 diag 85, \
225 product-ins 18, \
226 diag 63 \
227 }, \
228 partial TRUE \
229 } \
230 }, \
231 product-length 278, \
232 modifiers { \
233 stop-codon-found TRUE \
234 } \
235 } \
236 } \
237 ";
238
239 CNcbiIstrstream istrs(buf);
240
241 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
242
243 CSeq_align align;
244 *istr >> align;
245
246 BOOST_CHECK_NO_THROW(align.Validate(true));
247
248 CInternalStopFinder int_stop_finder(scope);
249
250 pair<set<TSeqPos>, set<TSeqPos> > starts_stops = int_stop_finder.FindStartsStops(align,3);
251
252 BOOST_CHECK_EQUAL( starts_stops.first.size(), 10U );
253 BOOST_CHECK_EQUAL( starts_stops.second.size(), 3U );
254 }
255
BOOST_AUTO_TEST_CASE(TestStartAcrossTheOrigin)256 BOOST_AUTO_TEST_CASE(TestStartAcrossTheOrigin)
257 {
258 CRef<CObjectManager> om = CObjectManager::GetInstance();
259 CScope scope(*om);
260 scope.AddDefaults();
261
262 string buf = " \
263 Seq-align ::= { \
264 type disc, \
265 dim 2, \
266 segs spliced { \
267 product-id gi 488735231, \
268 genomic-id gi 6382081, \
269 genomic-strand plus, \
270 product-type protein, \
271 exons { \
272 { \
273 product-start protpos { \
274 amin 2, \
275 frame 3 \
276 }, \
277 product-end protpos { \
278 amin 430, \
279 frame 3 \
280 }, \
281 genomic-start 1, \
282 genomic-end 1285, \
283 parts { \
284 diag 1285 \
285 }, \
286 partial FALSE \
287 } \
288 }, \
289 product-length 431, \
290 modifiers { \
291 stop-codon-found TRUE \
292 } \
293 } \
294 } \
295 ";
296
297 CNcbiIstrstream istrs(buf);
298
299 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
300
301 CSeq_align align;
302 *istr >> align;
303
304 BOOST_CHECK_NO_THROW(align.Validate(true));
305
306 CInternalStopFinder int_stop_finder(scope);
307
308 pair<set<TSeqPos>, set<TSeqPos> > starts_stops = int_stop_finder.FindStartsStops(align,11);
309
310 BOOST_CHECK_EQUAL( *starts_stops.first.rbegin(), 30740U );
311 }
312
BOOST_AUTO_TEST_CASE(TestStopInGap)313 BOOST_AUTO_TEST_CASE(TestStopInGap)
314 {
315 CRef<CObjectManager> om = CObjectManager::GetInstance();
316 CScope scope(*om);
317 scope.AddDefaults();
318
319 string buf = " \
320 Seq-align ::= { \
321 type disc, \
322 dim 2, \
323 segs spliced { \
324 product-id gi 487809918, \
325 genomic-id gi 341576043, \
326 genomic-strand minus, \
327 product-type protein, \
328 exons { \
329 { \
330 product-start protpos { \
331 amin 10, \
332 frame 1 \
333 }, \
334 product-end protpos { \
335 amin 165, \
336 frame 3 \
337 }, \
338 genomic-start 84235, \
339 genomic-end 84618, \
340 parts { \
341 diag 6, \
342 genomic-ins 6, \
343 diag 48, \
344 product-ins 3, \
345 diag 12, \
346 product-ins 15, \
347 diag 18, \
348 product-ins 72, \
349 diag 294 \
350 }, \
351 partial TRUE \
352 } \
353 }, \
354 product-length 166, \
355 modifiers { \
356 start-codon-found TRUE, \
357 stop-codon-found TRUE \
358 } \
359 } \
360 } \
361 ";
362
363 CNcbiIstrstream istrs(buf);
364
365 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
366
367 CSeq_align align;
368 *istr >> align;
369
370 BOOST_CHECK_NO_THROW(align.Validate(true));
371
372 CInternalStopFinder int_stop_finder(scope);
373
374 CFeatureGenerator fg(scope);
375 fg.SetFlags(CFeatureGenerator::fMaximizeTranslation);
376 CConstRef<CSeq_align> clean_alignment = fg.CleanAlignment(align);
377 set<TSeqPos> stops = int_stop_finder.FindStops(*clean_alignment);
378
379 BOOST_CHECK_EQUAL( *stops.begin(), 84609U );
380 }
381
BOOST_AUTO_TEST_CASE(TestAltStarts)382 BOOST_AUTO_TEST_CASE(TestAltStarts)
383 {
384 CRef<CObjectManager> om = CObjectManager::GetInstance();
385 CScope scope(*om);
386 scope.AddDefaults();
387
388 string buf = " \
389 Seq-align ::= { \
390 type disc, \
391 dim 2, \
392 segs spliced { \
393 product-id gi 148225248, \
394 genomic-id gi 224514980, \
395 genomic-strand plus, \
396 product-type protein, \
397 exons { \
398 { \
399 product-start protpos { \
400 amin 22, \
401 frame 1 \
402 }, \
403 product-end protpos { \
404 amin 277, \
405 frame 3 \
406 }, \
407 genomic-start 30641728, \
408 genomic-end 30642468, \
409 parts { \
410 diag 69, \
411 product-ins 3, \
412 diag 30, \
413 product-ins 4, \
414 diag 494, \
415 product-ins 2, \
416 diag 85, \
417 product-ins 18, \
418 diag 63 \
419 }, \
420 partial TRUE \
421 } \
422 }, \
423 product-length 278, \
424 modifiers { \
425 stop-codon-found TRUE \
426 } \
427 } \
428 } \
429 ";
430
431 CNcbiIstrstream istrs(buf);
432
433 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
434
435 CSeq_align align;
436 *istr >> align;
437
438 BOOST_CHECK_NO_THROW(align.Validate(true));
439
440 CInternalStopFinder int_stop_finder(scope);
441
442 typedef map<TSeqRange, string> TStarts;
443 TStarts starts = int_stop_finder.FindStartStopRanges(align,3).first;
444
445 int atg_starts = 0;
446 ITERATE(TStarts, s, starts) {
447 if (s->second == "ATG")
448 ++atg_starts;
449 }
450
451 BOOST_CHECK_EQUAL( atg_starts, 3 );
452 }
453
BOOST_AUTO_TEST_CASE(TestAtCircularEnd)454 BOOST_AUTO_TEST_CASE(TestAtCircularEnd)
455 {
456 CRef<CObjectManager> om = CObjectManager::GetInstance();
457 CScope scope(*om);
458 scope.AddDefaults();
459
460 string buf = " \
461 Seq-align ::= { \
462 type disc, \
463 dim 2, \
464 segs spliced { \
465 product-id gi 490246827, \
466 genomic-id gi 364515570, \
467 genomic-strand minus, \
468 product-type protein, \
469 exons { \
470 { \
471 product-start protpos { \
472 amin 0, \
473 frame 1 \
474 }, \
475 product-end protpos { \
476 amin 628, \
477 frame 3 \
478 }, \
479 genomic-start 5332055, \
480 genomic-end 5333941, \
481 parts { \
482 match 3, \
483 diag 1884 \
484 } \
485 } \
486 }, \
487 product-length 629, \
488 modifiers { \
489 start-codon-found TRUE, \
490 stop-codon-found TRUE \
491 } \
492 } \
493 } \
494 ";
495
496 CNcbiIstrstream istrs(buf);
497
498 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
499
500 CSeq_align align;
501 *istr >> align;
502
503 BOOST_CHECK_NO_THROW(align.Validate(true));
504
505 CInternalStopFinder int_stop_finder(scope);
506
507 for (int pad=33; pad > 30; --pad) {
508 auto starts_stops_ranges = int_stop_finder.FindStartStopRanges(align, pad);
509
510 BOOST_CHECK( starts_stops_ranges.first.find(TSeqRange(5333941,5333939)) != starts_stops_ranges.first.end() );
511 BOOST_CHECK_EQUAL( starts_stops_ranges.first[TSeqRange(5333941,5333939)], "ATG" );
512 }
513 }
514
BOOST_AUTO_TEST_CASE(TestReportGaps)515 BOOST_AUTO_TEST_CASE(TestReportGaps)
516 {
517 CRef<CObjectManager> om = CObjectManager::GetInstance();
518 CScope scope(*om);
519 scope.AddDefaults();
520
521 string buf = " \
522 Seq-align ::= { \
523 type disc, \
524 dim 2, \
525 segs spliced { \
526 product-id gi 487427171, \
527 genomic-id gi 357958168, \
528 genomic-strand plus, \
529 product-type protein, \
530 exons { \
531 { \
532 product-start protpos { \
533 amin 3, \
534 frame 1 \
535 }, \
536 product-end protpos { \
537 amin 96, \
538 frame 3 \
539 }, \
540 genomic-start 31, \
541 genomic-end 312, \
542 parts { \
543 diag 282 \
544 }, \
545 partial TRUE \
546 } \
547 }, \
548 product-length 97 \
549 } \
550 } \
551 ";
552
553 CNcbiIstrstream istrs(buf);
554
555 unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
556
557 CSeq_align align;
558 *istr >> align;
559
560 BOOST_CHECK_NO_THROW(align.Validate(true));
561
562 CInternalStopFinder int_stop_finder(scope);
563
564 set<TSignedSeqRange> gaps;
565 auto starts_stops = int_stop_finder.FindStartStopRanges(align, 1100, &gaps);
566
567 BOOST_CHECK_EQUAL( gaps.size(), 3U );
568 auto it = gaps.begin();
569 BOOST_CHECK_EQUAL( it->GetFrom(), -10 );
570 BOOST_CHECK_EQUAL( it->GetTo(), -1 );
571 ++it;
572 BOOST_CHECK_EQUAL( it->GetFrom(), 26 );
573 BOOST_CHECK_EQUAL( it->GetTo(), 28 );
574 ++it;
575 BOOST_CHECK_EQUAL( it->GetFrom(), 1405 );
576 BOOST_CHECK_EQUAL( it->GetTo(), 1414 );
577 }
578