1 /*  $Id: unit_test_internal_stops.cpp 631487 2021-05-19 13:44:58Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mike DiCuccio
27  *
28  * File Description:
29  *
30  * ===========================================================================
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/test_boost.hpp>
39 
40 #include <objmgr/object_manager.hpp>
41 #include <objtools/data_loaders/genbank/gbloader.hpp>
42 #include <objmgr/scope.hpp>
43 
44 #include <serial/serial.hpp>
45 #include <serial/objistr.hpp>
46 #include <serial/objostr.hpp>
47 
48 #include <algo/sequence/internal_stops.hpp>
49 
50 USING_NCBI_SCOPE;
51 USING_SCOPE(objects);
52 
NCBITEST_INIT_CMDLINE(arg_desc)53 NCBITEST_INIT_CMDLINE(arg_desc)
54 {
55     // Here we make descriptions of command line parameters that we are
56     // going to use.
57 
58 }
59 
NCBITEST_AUTO_INIT()60 NCBITEST_AUTO_INIT()
61 {
62     CRef<CObjectManager> om = CObjectManager::GetInstance();
63     CGBDataLoader::RegisterInObjectManager(*om);
64 }
65 
BOOST_AUTO_TEST_CASE(TestProtein)66 BOOST_AUTO_TEST_CASE(TestProtein)
67 {
68     CRef<CObjectManager> om = CObjectManager::GetInstance();
69     CScope scope(*om);
70     scope.AddDefaults();
71 
72 string buf = " \
73 Seq-align ::= { \
74   type disc, \
75   dim 2, \
76   segs spliced { \
77     product-id gi 148225248, \
78     genomic-id gi 224514980, \
79     genomic-strand plus, \
80     product-type protein, \
81     exons { \
82       { \
83         product-start protpos { \
84           amin 22, \
85           frame 1 \
86         }, \
87         product-end protpos { \
88           amin 277, \
89           frame 3 \
90         }, \
91         genomic-start 30641728, \
92         genomic-end 30642468, \
93         parts { \
94           diag 69, \
95           product-ins 3, \
96           diag 30, \
97           product-ins 4, \
98           diag 494, \
99           product-ins 2, \
100           diag 85, \
101           product-ins 18, \
102           diag 63 \
103         }, \
104         partial TRUE \
105       } \
106     }, \
107     product-length 278, \
108     modifiers { \
109       stop-codon-found TRUE \
110     } \
111   } \
112 } \
113 ";
114 
115     CNcbiIstrstream istrs(buf);
116 
117     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
118 
119     CSeq_align align;
120     *istr >> align;
121 
122     BOOST_CHECK_NO_THROW(align.Validate(true));
123 
124     CInternalStopFinder int_stop_finder(scope);
125 
126     set<TSeqPos> stops = int_stop_finder.FindStops(align);
127 
128     BOOST_CHECK_EQUAL( stops.size(), 2U );
129 }
130 
131 /*
132 BOOST_AUTO_TEST_CASE(TestMRNA)
133 {
134     CRef<CObjectManager> om = CObjectManager::GetInstance();
135     CScope scope(*om);
136     scope.AddDefaults();
137 
138 string buf = " \
139 Seq-align ::= { \
140   type disc, \
141   dim 2, \
142   segs spliced { \
143     product-id gi 178405, \
144     genomic-id gi 224514917, \
145     product-strand plus, \
146     genomic-strand minus, \
147     product-type transcript, \
148     exons { \
149       { \
150         product-start nucpos 0, \
151         product-end nucpos 2083, \
152         genomic-start 78159147, \
153         genomic-end 78161233, \
154         parts { \
155           match 159, \
156           genomic-ins 1, \
157           match 25, \
158           genomic-ins 1, \
159           match 47, \
160           product-ins 1, \
161           match 361, \
162           genomic-ins 2, \
163           match 1491 \
164         } \
165       } \
166     }, \
167     product-length 2084 \
168   } \
169 } \
170 ";
171 
172     CNcbiIstrstream istrs(buf.c_str());
173 
174     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
175 
176     CSeq_align align;
177     *istr >> align;
178 
179     BOOST_CHECK_NO_THROW(align.Validate(true));
180 
181     CInternalStopFinder int_stop_finder(scope);
182 
183     set<TSeqPos> stops = int_stop_finder.FindStops(align);
184 
185     BOOST_CHECK_EQUAL( stops.size(), 9U );
186 }
187 */
188 
189 
BOOST_AUTO_TEST_CASE(TestStartStopProteinWithPadding)190 BOOST_AUTO_TEST_CASE(TestStartStopProteinWithPadding)
191 {
192     CRef<CObjectManager> om = CObjectManager::GetInstance();
193     CScope scope(*om);
194     scope.AddDefaults();
195 
196 string buf = " \
197 Seq-align ::= { \
198   type disc, \
199   dim 2, \
200   segs spliced { \
201     product-id gi 148225248, \
202     genomic-id gi 224514980, \
203     genomic-strand plus, \
204     product-type protein, \
205     exons { \
206       { \
207         product-start protpos { \
208           amin 22, \
209           frame 1 \
210         }, \
211         product-end protpos { \
212           amin 277, \
213           frame 3 \
214         }, \
215         genomic-start 30641728, \
216         genomic-end 30642468, \
217         parts { \
218           diag 69, \
219           product-ins 3, \
220           diag 30, \
221           product-ins 4, \
222           diag 494, \
223           product-ins 2, \
224           diag 85, \
225           product-ins 18, \
226           diag 63 \
227         }, \
228         partial TRUE \
229       } \
230     }, \
231     product-length 278, \
232     modifiers { \
233       stop-codon-found TRUE \
234     } \
235   } \
236 } \
237 ";
238 
239     CNcbiIstrstream istrs(buf);
240 
241     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
242 
243     CSeq_align align;
244     *istr >> align;
245 
246     BOOST_CHECK_NO_THROW(align.Validate(true));
247 
248     CInternalStopFinder int_stop_finder(scope);
249 
250     pair<set<TSeqPos>, set<TSeqPos> > starts_stops = int_stop_finder.FindStartsStops(align,3);
251 
252     BOOST_CHECK_EQUAL( starts_stops.first.size(), 10U );
253     BOOST_CHECK_EQUAL( starts_stops.second.size(), 3U );
254 }
255 
BOOST_AUTO_TEST_CASE(TestStartAcrossTheOrigin)256 BOOST_AUTO_TEST_CASE(TestStartAcrossTheOrigin)
257 {
258     CRef<CObjectManager> om = CObjectManager::GetInstance();
259     CScope scope(*om);
260     scope.AddDefaults();
261 
262 string buf = " \
263 Seq-align ::= { \
264   type disc, \
265   dim 2, \
266   segs spliced { \
267     product-id gi 488735231, \
268     genomic-id gi 6382081, \
269     genomic-strand plus, \
270     product-type protein, \
271     exons { \
272       { \
273         product-start protpos { \
274           amin 2, \
275           frame 3 \
276         }, \
277         product-end protpos { \
278           amin 430, \
279           frame 3 \
280         }, \
281         genomic-start 1, \
282         genomic-end 1285, \
283         parts { \
284           diag 1285 \
285         }, \
286         partial FALSE \
287       } \
288     }, \
289     product-length 431, \
290     modifiers { \
291       stop-codon-found TRUE \
292     } \
293   } \
294 } \
295 ";
296 
297     CNcbiIstrstream istrs(buf);
298 
299     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
300 
301     CSeq_align align;
302     *istr >> align;
303 
304     BOOST_CHECK_NO_THROW(align.Validate(true));
305 
306     CInternalStopFinder int_stop_finder(scope);
307 
308     pair<set<TSeqPos>, set<TSeqPos> > starts_stops = int_stop_finder.FindStartsStops(align,11);
309 
310     BOOST_CHECK_EQUAL( *starts_stops.first.rbegin(), 30740U );
311 }
312 
BOOST_AUTO_TEST_CASE(TestStopInGap)313 BOOST_AUTO_TEST_CASE(TestStopInGap)
314 {
315     CRef<CObjectManager> om = CObjectManager::GetInstance();
316     CScope scope(*om);
317     scope.AddDefaults();
318 
319 string buf = " \
320 Seq-align ::= { \
321   type disc, \
322   dim 2, \
323   segs spliced { \
324     product-id gi 487809918, \
325     genomic-id gi 341576043, \
326     genomic-strand minus, \
327     product-type protein, \
328     exons { \
329       { \
330         product-start protpos { \
331           amin 10, \
332           frame 1 \
333         }, \
334         product-end protpos { \
335           amin 165, \
336           frame 3 \
337         }, \
338         genomic-start 84235, \
339         genomic-end 84618, \
340         parts { \
341           diag 6, \
342           genomic-ins 6, \
343           diag 48, \
344           product-ins 3, \
345           diag 12, \
346           product-ins 15, \
347           diag 18, \
348           product-ins 72, \
349           diag 294 \
350         }, \
351         partial TRUE \
352       } \
353     }, \
354     product-length 166, \
355     modifiers { \
356       start-codon-found TRUE, \
357       stop-codon-found TRUE \
358     } \
359   } \
360 } \
361 ";
362 
363     CNcbiIstrstream istrs(buf);
364 
365     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
366 
367     CSeq_align align;
368     *istr >> align;
369 
370     BOOST_CHECK_NO_THROW(align.Validate(true));
371 
372     CInternalStopFinder int_stop_finder(scope);
373 
374     CFeatureGenerator fg(scope);
375     fg.SetFlags(CFeatureGenerator::fMaximizeTranslation);
376     CConstRef<CSeq_align> clean_alignment = fg.CleanAlignment(align);
377     set<TSeqPos> stops = int_stop_finder.FindStops(*clean_alignment);
378 
379     BOOST_CHECK_EQUAL( *stops.begin(), 84609U );
380 }
381 
BOOST_AUTO_TEST_CASE(TestAltStarts)382 BOOST_AUTO_TEST_CASE(TestAltStarts)
383 {
384     CRef<CObjectManager> om = CObjectManager::GetInstance();
385     CScope scope(*om);
386     scope.AddDefaults();
387 
388 string buf = " \
389 Seq-align ::= { \
390   type disc, \
391   dim 2, \
392   segs spliced { \
393     product-id gi 148225248, \
394     genomic-id gi 224514980, \
395     genomic-strand plus, \
396     product-type protein, \
397     exons { \
398       { \
399         product-start protpos { \
400           amin 22, \
401           frame 1 \
402         }, \
403         product-end protpos { \
404           amin 277, \
405           frame 3 \
406         }, \
407         genomic-start 30641728, \
408         genomic-end 30642468, \
409         parts { \
410           diag 69, \
411           product-ins 3, \
412           diag 30, \
413           product-ins 4, \
414           diag 494, \
415           product-ins 2, \
416           diag 85, \
417           product-ins 18, \
418           diag 63 \
419         }, \
420         partial TRUE \
421       } \
422     }, \
423     product-length 278, \
424     modifiers { \
425       stop-codon-found TRUE \
426     } \
427   } \
428 } \
429 ";
430 
431     CNcbiIstrstream istrs(buf);
432 
433     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
434 
435     CSeq_align align;
436     *istr >> align;
437 
438     BOOST_CHECK_NO_THROW(align.Validate(true));
439 
440     CInternalStopFinder int_stop_finder(scope);
441 
442     typedef map<TSeqRange, string> TStarts;
443     TStarts starts = int_stop_finder.FindStartStopRanges(align,3).first;
444 
445     int atg_starts = 0;
446     ITERATE(TStarts, s, starts) {
447         if (s->second == "ATG")
448             ++atg_starts;
449     }
450 
451     BOOST_CHECK_EQUAL( atg_starts, 3 );
452 }
453 
BOOST_AUTO_TEST_CASE(TestAtCircularEnd)454 BOOST_AUTO_TEST_CASE(TestAtCircularEnd)
455 {
456     CRef<CObjectManager> om = CObjectManager::GetInstance();
457     CScope scope(*om);
458     scope.AddDefaults();
459 
460 string buf = " \
461 Seq-align ::= { \
462   type disc, \
463   dim 2, \
464   segs spliced { \
465     product-id gi 490246827, \
466     genomic-id gi 364515570, \
467     genomic-strand minus, \
468     product-type protein, \
469     exons { \
470       { \
471         product-start protpos { \
472           amin 0, \
473           frame 1 \
474         }, \
475         product-end protpos { \
476           amin 628, \
477           frame 3 \
478         }, \
479         genomic-start 5332055, \
480         genomic-end 5333941, \
481         parts { \
482           match 3, \
483           diag 1884 \
484         } \
485       } \
486     }, \
487     product-length 629, \
488     modifiers { \
489       start-codon-found TRUE, \
490       stop-codon-found TRUE \
491     } \
492   } \
493 } \
494 ";
495 
496     CNcbiIstrstream istrs(buf);
497 
498     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
499 
500     CSeq_align align;
501     *istr >> align;
502 
503     BOOST_CHECK_NO_THROW(align.Validate(true));
504 
505     CInternalStopFinder int_stop_finder(scope);
506 
507     for (int pad=33; pad > 30; --pad) {
508         auto starts_stops_ranges = int_stop_finder.FindStartStopRanges(align, pad);
509 
510         BOOST_CHECK( starts_stops_ranges.first.find(TSeqRange(5333941,5333939)) != starts_stops_ranges.first.end() );
511         BOOST_CHECK_EQUAL( starts_stops_ranges.first[TSeqRange(5333941,5333939)], "ATG" );
512     }
513 }
514 
BOOST_AUTO_TEST_CASE(TestReportGaps)515 BOOST_AUTO_TEST_CASE(TestReportGaps)
516 {
517     CRef<CObjectManager> om = CObjectManager::GetInstance();
518     CScope scope(*om);
519     scope.AddDefaults();
520 
521 string buf = " \
522 Seq-align ::= { \
523   type disc, \
524   dim 2, \
525   segs spliced { \
526     product-id gi 487427171, \
527     genomic-id gi 357958168, \
528     genomic-strand plus, \
529     product-type protein, \
530     exons { \
531       { \
532         product-start protpos { \
533           amin 3, \
534           frame 1 \
535         }, \
536         product-end protpos { \
537           amin 96, \
538           frame 3 \
539         }, \
540         genomic-start 31, \
541         genomic-end 312, \
542         parts { \
543           diag 282 \
544         }, \
545         partial TRUE \
546       } \
547     }, \
548     product-length 97 \
549   } \
550 } \
551 ";
552 
553     CNcbiIstrstream istrs(buf);
554 
555     unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
556 
557     CSeq_align align;
558     *istr >> align;
559 
560     BOOST_CHECK_NO_THROW(align.Validate(true));
561 
562     CInternalStopFinder int_stop_finder(scope);
563 
564     set<TSignedSeqRange> gaps;
565     auto starts_stops = int_stop_finder.FindStartStopRanges(align, 1100, &gaps);
566 
567     BOOST_CHECK_EQUAL( gaps.size(), 3U );
568     auto it = gaps.begin();
569     BOOST_CHECK_EQUAL( it->GetFrom(), -10 );
570     BOOST_CHECK_EQUAL( it->GetTo(),   -1 );
571     ++it;
572     BOOST_CHECK_EQUAL( it->GetFrom(), 26 );
573     BOOST_CHECK_EQUAL( it->GetTo(),   28 );
574     ++it;
575     BOOST_CHECK_EQUAL( it->GetFrom(), 1405 );
576     BOOST_CHECK_EQUAL( it->GetTo(),   1414 );
577 }
578