1 /*  $Id: xcript_tests.cpp 545584 2017-09-07 17:37:11Z astashya $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Mike DiCuccio, Josh Cherry
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <algo/seqqa/xcript_tests.hpp>
34 #include <corelib/ncbitime.hpp>
35 #include <objects/general/Date.hpp>
36 #include <objects/general/Object_id.hpp>
37 #include <objects/general/User_object.hpp>
38 #include <objects/seq/Bioseq.hpp>
39 #include <objects/seq/MolInfo.hpp>
40 #include <objects/seq/Seq_descr.hpp>
41 #include <objects/seq/Seq_inst.hpp>
42 #include <objects/seq/Seqdesc.hpp>
43 #include <objects/seqtest/Seq_test_result.hpp>
44 #include <objects/seq/Seq_hist.hpp>
45 #include <objects/seq/Seq_hist_rec.hpp>
46 #include <objects/seq/seqport_util.hpp>
47 #include <serial/iterator.hpp>
48 #include <objects/seqfeat/Seq_feat.hpp>
49 #include <objects/seqfeat/SeqFeatData.hpp>
50 #include <objmgr/util/sequence.hpp>
51 #include <objmgr/seqdesc_ci.hpp>
52 #include <objmgr/feat_ci.hpp>
53 #include <objects/seqfeat/Cdregion.hpp>
54 #include <objects/seqfeat/Genetic_code.hpp>
55 #include <objects/seqfeat/Genetic_code_table.hpp>
56 #include <objects/seqfeat/Code_break.hpp>
57 #include <objmgr/seq_vector.hpp>
58 #include <algo/gnomon/gnomon.hpp>
59 #include <algo/sequence/orf.hpp>
60 #include <map>
61 
62 BEGIN_NCBI_SCOPE
63 USING_SCOPE(objects);
64 USING_SCOPE(gnomon);
65 
66 
CanTest(const CSerialObject & obj,const CSeqTestContext * ctx) const67 bool CTestTranscript::CanTest(const CSerialObject& obj,
68                               const CSeqTestContext* ctx) const
69 {
70     const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
71     if (id  &&  ctx) {
72         CBioseq_Handle handle = ctx->GetScope().GetBioseqHandle(*id);
73         return handle.CanGetInst_Mol() && handle.GetInst_Mol() == CSeq_inst::eMol_rna;
74     }
75     return false;
76 }
77 
78 
79 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)80 CTestTranscript_CountCdregions::RunTest(const CSerialObject& obj,
81                                        const CSeqTestContext* ctx)
82 {
83     CRef<CSeq_test_result_set> ref;
84     const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
85     if ( !id  ||  !ctx ) {
86         return ref;
87     }
88 
89     ref.Reset(new CSeq_test_result_set());
90 
91     CRef<CSeq_test_result> result = x_SkeletalTestResult("count_cdregions");
92     ref->Set().push_back(result);
93 
94     SAnnotSelector sel;
95     sel.SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
96     sel.SetResolveDepth(0);
97     CSeq_loc loc;
98     loc.SetWhole().Assign(*id);
99     CFeat_CI feat_iter(ctx->GetScope(), loc, sel);
100 
101     result->SetOutput_data()
102         .AddField("count", (int) feat_iter.GetSize());
103     return ref;
104 }
105 
106 
107 // A simplistic Kozak strength.  Best is RNNXXXG, where
108 // XXX is the start codon.  This is "strong".  Having just
109 // R or just G is "moderate", and having neither is "weak".
110 enum EKozakStrength
111 {
112     eNone,     // used to indicate that no start has been seen
113     eWeak,
114     eModerate,
115     eStrong
116 };
117 
118 
119 // vec must be set to IUPAC coding
s_GetKozakStrength(const CSeqVector & vec,TSeqPos pos)120 EKozakStrength s_GetKozakStrength(const CSeqVector& vec, TSeqPos pos)
121 {
122     int score = eWeak;
123     if (pos >= 3 &&
124         (vec[pos - 3] == 'A' || vec[pos - 3] == 'G')) {
125         ++score;
126     }
127     if (vec.size() > pos + 3 &&
128         vec[pos + 3] == 'G') {
129         ++score;
130     }
131     return EKozakStrength(score);
132 }
133 
134 
s_KozakStrengthToString(EKozakStrength strength)135 string s_KozakStrengthToString(EKozakStrength strength)
136 {
137     switch (strength) {
138     default:
139     case eNone:
140         return "none";
141 
142     case eWeak:
143         return "weak";
144 
145     case eModerate:
146         return "moderate";
147 
148     case eStrong:
149         return "strong";
150     }
151 }
152 
153 
154 // Return a SeqVector describing the coding regioin, in the
155 // correct orientation, plus the upstream region.
156 // Put the length of the upstream region in upstream_length
s_GetCdregionPlusUpstream(CFeat_CI feat_iter,const CSeqTestContext * ctx,TSeqPos & upstream_length)157 static CSeqVector s_GetCdregionPlusUpstream(CFeat_CI feat_iter,
158                                             const CSeqTestContext* ctx,
159                                             TSeqPos& upstream_length)
160 {
161     CScope& scope = ctx->GetScope();
162     const CSeq_loc& first_cds_loc
163         = CSeq_loc_CI(feat_iter->GetLocation()).GetEmbeddingSeq_loc();
164     CRef<CSeq_loc> upstr(new CSeq_loc);
165     const CSeq_id& id = sequence::GetId(first_cds_loc, 0);
166     upstr->SetInt().SetId().Assign(id);
167     if (sequence::GetStrand(first_cds_loc) == eNa_strand_minus) {
168         upstr->SetInt().SetStrand(eNa_strand_minus);
169         upstr->SetInt().SetFrom(sequence::GetStop(first_cds_loc, 0) + 1);
170         upstr->SetInt().SetTo(sequence::GetLength(id, &scope) - 1);
171     } else {
172         upstr->SetInt().SetFrom(0);
173         upstr->SetInt().SetTo(sequence::GetStart(first_cds_loc, 0) - 1);
174     }
175     CSeq_loc loc;
176     loc.SetMix().AddSeqLoc(*upstr);
177     loc.SetMix().AddSeqLoc(feat_iter->GetLocation());
178     CSeqVector vec(loc, scope);
179     upstream_length = sequence::GetLength(*upstr, 0);
180     return vec;
181 }
182 
183 
184 // If set, return genetic code.  Otherwise return
185 // "standard" genetic code.
s_GetCode(const CCdregion & cdr)186 CConstRef<CGenetic_code> s_GetCode(const CCdregion& cdr)
187 {
188     if (cdr.CanGetCode()) {
189         return CConstRef<CGenetic_code>(&cdr.GetCode());
190     }
191     CRef<CGenetic_code> standard(new CGenetic_code);
192     CRef<CGenetic_code::C_E> code_id(new CGenetic_code::C_E);
193     code_id->SetId(1);
194     standard->Set().push_back(code_id);
195     return standard;
196 }
197 
198 
s_CdsFlags(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)199 static void s_CdsFlags(const CSeq_id& id, const CSeqTestContext* ctx,
200                        CFeat_CI feat_iter, CSeq_test_result& result)
201 {
202     result.SetOutput_data()
203         .AddField("is_partial",
204                   feat_iter->IsSetPartial() && feat_iter->GetPartial());
205     result.SetOutput_data()
206         .AddField("is_pseudo",
207                   feat_iter->IsSetPseudo() && feat_iter->GetPseudo());
208     result.SetOutput_data()
209         .AddField("is_except",
210                   feat_iter->IsSetExcept() && feat_iter->GetExcept());
211 }
212 
213 
214 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)215 CTestTranscript_CdsFlags::RunTest(const CSerialObject& obj,
216                                   const CSeqTestContext* ctx)
217 {
218     return x_TestAllCdregions(obj, ctx, "cds_flags", s_CdsFlags);
219 }
220 
221 
s_InframeUpstreamStart(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)222 static void s_InframeUpstreamStart(const CSeq_id& id,
223                                    const CSeqTestContext* ctx,
224                                    CFeat_CI feat_iter,
225                                    CSeq_test_result& result)
226 {
227     TSeqPos upstream_length;
228     CSeqVector vec =
229         s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
230     vec.SetIupacCoding();
231 
232     EKozakStrength strength;
233     EKozakStrength best_strength = eNone;
234     TSeqPos pos_nearest_best_start = 0; // initialize to avoid compiler warning
235     for (int i = upstream_length - 3;  i >= 0;  i -= 3) {
236         if (vec[i] == 'A' && vec[i + 1] == 'T' && vec[i + 2] == 'G') {
237             strength = s_GetKozakStrength(vec, i);
238             if (strength > best_strength) {
239                 best_strength = strength;
240                 pos_nearest_best_start = i;
241             }
242         }
243     }
244     result.SetOutput_data()
245         .AddField("inframe_upstream_start_exists", best_strength != eNone);
246     if (best_strength != eNone) {
247         result.SetOutput_data()
248             .AddField("inframe_upstream_start_best_kozak_strength",
249                       s_KozakStrengthToString(best_strength));
250         result.SetOutput_data()
251             .AddField("nearest_best_upstream_start_distance",
252                       int(upstream_length - pos_nearest_best_start - 3));
253     }
254 }
255 
256 
257 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)258 CTestTranscript_InframeUpstreamStart::RunTest(const CSerialObject& obj,
259                                               const CSeqTestContext* ctx)
260 {
261     return x_TestAllCdregions(obj, ctx, "inframe_upstream_start",
262                               s_InframeUpstreamStart);
263 }
264 
265 
s_InframeUpstreamStop(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)266 static void s_InframeUpstreamStop(const CSeq_id& id,
267                                   const CSeqTestContext* ctx,
268                                   CFeat_CI feat_iter, CSeq_test_result& result)
269 {
270     CConstRef<CGenetic_code> code =
271         s_GetCode(feat_iter->GetData().GetCdregion());
272     const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
273 
274     TSeqPos upstream_length;
275     CSeqVector vec =
276         s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
277     vec.SetIupacCoding();
278 
279     for (int i = upstream_length - 3;  i >= 0;  i -= 3) {
280         if (tbl.IsOrfStop(tbl.SetCodonState(vec[i], vec[i + 1],
281                                             vec[i + 2]))) {
282             result.SetOutput_data()
283                 .AddField("inframe_upstream_stop_exists",
284                           true);
285             result.SetOutput_data()
286                 .AddField("nearest_inframe_upstream_stop_distance",
287                           int(upstream_length - i - 3));
288             return;
289         }
290     }
291     result.SetOutput_data()
292         .AddField("inframe_upstream_stop_exists",
293                   false);
294 }
295 
296 
297 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)298 CTestTranscript_InframeUpstreamStop::RunTest(const CSerialObject& obj,
299                                              const CSeqTestContext* ctx)
300 {
301     return x_TestAllCdregions(obj, ctx, "inframe_upstream_stop",
302                               s_InframeUpstreamStop);
303 }
304 
305 
s_CodingPropensity(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)306 static void s_CodingPropensity(const CSeq_id& id, const CSeqTestContext* ctx,
307                                CFeat_CI feat_iter, CSeq_test_result& result)
308 {
309     //creating CHMMParameters object from file is expensive, so
310     //created objects are cached in a static map.
311     //It is reasonable to expect that the number of parameter files
312     //per program is small, and so the cache size does not need
313     //to be limited.
314 
315     static std::map<string, CRef<CHMMParameters> > s_hmmparams_cache;
316     DEFINE_STATIC_FAST_MUTEX(map_mutex);
317 
318     CRef<CHMMParameters> hmm_params;
319 
320     if (!ctx->HasKey("gnomon_model_file")) {
321         return;
322     }
323 
324 
325     string model_file_name = (*ctx)["gnomon_model_file"];
326 
327     {{
328         CFastMutexGuard guard(map_mutex); //released when goes out of scope 4 lines below
329         if(s_hmmparams_cache.find(model_file_name) == s_hmmparams_cache.end()) {
330             CNcbiIfstream model_file(model_file_name.c_str());
331             s_hmmparams_cache[model_file_name] = CRef<CHMMParameters>(new CHMMParameters(model_file));
332         }
333     }}
334 
335     hmm_params = s_hmmparams_cache[model_file_name];
336 
337 
338     const CSeq_loc& cds = feat_iter->GetLocation();
339 
340     int gccontent=0;
341     double score = CCodingPropensity::GetScore(hmm_params, cds,
342                                                ctx->GetScope(), &gccontent);
343 
344     // Record results
345     result.SetOutput_data()
346         .AddField("model_file", model_file_name);
347     result.SetOutput_data()
348         .AddField("model_percent_gc", gccontent);
349     result.SetOutput_data()
350         .AddField("score", max(score, -1e100));
351 
352 }
353 
354 
355 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)356 CTestTranscript_CodingPropensity::RunTest(const CSerialObject& obj,
357                                           const CSeqTestContext* ctx)
358 {
359     return x_TestAllCdregions(obj, ctx, "coding_propensity",
360                               s_CodingPropensity);
361 }
362 
363 
364 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)365 CTestTranscript_TranscriptLength::RunTest(const CSerialObject& obj,
366                                           const CSeqTestContext* ctx)
367 {
368     CRef<CSeq_test_result_set> ref;
369     const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
370     if ( !id  ||  !ctx ) {
371         return ref;
372     }
373 
374     ref.Reset(new CSeq_test_result_set());
375 
376     CRef<CSeq_test_result> result = x_SkeletalTestResult("transcript_length");
377     ref->Set().push_back(result);
378 
379     int len = ctx->GetScope()
380         .GetBioseqHandle(dynamic_cast<const CSeq_id&>(obj)).GetInst_Length();
381     result->SetOutput_data()
382         .AddField("length", len);
383     return ref;
384 }
385 
386 
s_CdsLength(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)387 static void s_CdsLength(const CSeq_id& id, const CSeqTestContext* ctx,
388                         CFeat_CI feat_iter, CSeq_test_result& result)
389 {
390     result.SetOutput_data()
391         .AddField("length",
392                   (int)sequence::GetLength(feat_iter->GetLocation(), 0));
393 }
394 
395 
396 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)397 CTestTranscript_TranscriptCdsLength::RunTest(const CSerialObject& obj,
398                                              const CSeqTestContext* ctx)
399 {
400     return x_TestAllCdregions(obj, ctx, "cds_length", s_CdsLength);
401 }
402 
403 
s_Utrs(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)404 static void s_Utrs(const CSeq_id& id, const CSeqTestContext* ctx,
405                    CFeat_CI feat_iter, CSeq_test_result& result)
406 {
407     const CSeq_loc& loc = feat_iter->GetLocation();
408     TSeqPos cds_from = sequence::GetStart(loc, 0);
409     TSeqPos cds_to   = sequence::GetStop(loc, 0);
410     int xcript_len = ctx->GetScope().GetBioseqHandle(id).GetInst_Length();
411     result.SetOutput_data().AddField("length_5_prime_utr", (int) cds_from);
412     result.SetOutput_data().AddField("length_3_prime_utr",
413                                      (int) (xcript_len - cds_to - 1));
414 }
415 
416 
417 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)418 CTestTranscript_Utrs::RunTest(const CSerialObject& obj,
419                               const CSeqTestContext* ctx)
420 {
421     return x_TestAllCdregions(obj, ctx, "utrs", s_Utrs);
422 }
423 
424 
s_CdsStartCodon(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)425 static void s_CdsStartCodon(const CSeq_id& id, const CSeqTestContext* ctx,
426                             CFeat_CI feat_iter, CSeq_test_result& result)
427 {
428     CConstRef<CGenetic_code> code =
429         s_GetCode(feat_iter->GetData().GetCdregion());
430     const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
431 
432     TSeqPos upstream_length;
433     CSeqVector vec =
434         s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
435     vec.SetIupacCoding();
436 
437     string seq;
438     vec.GetSeqData(upstream_length, upstream_length + 3, seq);
439     // is this an officially sanctioned start?
440     result.SetOutput_data()
441         .AddField("is_start",
442                   tbl.IsOrfStart(tbl.SetCodonState(seq[0], seq[1], seq[2])));
443     // record it, whatever it is
444     result.SetOutput_data()
445         .AddField("first_codon", seq);
446 
447     result.SetOutput_data()
448         .AddField("kozak_strength",
449                   s_KozakStrengthToString(s_GetKozakStrength(vec, upstream_length)));
450 }
451 
452 
453 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)454 CTestTranscript_CdsStartCodon::RunTest(const CSerialObject& obj,
455                                        const CSeqTestContext* ctx)
456 {
457     return x_TestAllCdregions(obj, ctx, "cds_start_codon", s_CdsStartCodon);
458 }
459 
460 
s_CdsStopCodon(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)461 static void s_CdsStopCodon(const CSeq_id& id, const CSeqTestContext* ctx,
462                            CFeat_CI feat_iter, CSeq_test_result& result)
463 {
464     CConstRef<CGenetic_code> code =
465         s_GetCode(feat_iter->GetData().GetCdregion());
466     const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
467 
468     CSeqVector vec(feat_iter->GetLocation(), ctx->GetScope());
469     vec.SetIupacCoding();
470     string seq;
471     vec.GetSeqData(vec.size() - 3, vec.size(), seq);
472     result.SetOutput_data()
473         .AddField("is_stop",
474                   tbl.IsOrfStop(tbl.SetCodonState(seq[0], seq[1], seq[2])));
475 }
476 
477 
478 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)479 CTestTranscript_CdsStopCodon::RunTest(const CSerialObject& obj,
480                                       const CSeqTestContext* ctx)
481 {
482     return x_TestAllCdregions(obj, ctx, "cds_stop_codon", s_CdsStopCodon);
483 }
484 
485 
486 // Determine the position in a cds of the start of a Code-break
CodeBreakPosInCds(const CCode_break & code_break,const CSeq_feat & feat,CScope & scope)487 inline TSeqPos CodeBreakPosInCds(const CCode_break& code_break,
488                                  const CSeq_feat& feat, CScope& scope)
489 {
490     return sequence::LocationOffset(feat.GetLocation(), code_break.GetLoc(),
491                                     sequence::eOffset_FromStart, &scope);
492 }
493 
494 
495 // Determine whether a Code-break is a selenocysteine
s_IsSelenocysteine(const CCode_break & code_break)496 static bool s_IsSelenocysteine(const CCode_break& code_break)
497 {
498     switch (code_break.GetAa().Which()) {
499     case CCode_break::TAa::e_Ncbieaa:
500         return code_break.GetAa().GetNcbieaa() == 85;
501     case CCode_break::TAa::e_Ncbi8aa:
502         return code_break.GetAa().GetNcbi8aa() == 24;
503     case CCode_break::TAa::e_Ncbistdaa:
504         return code_break.GetAa().GetNcbistdaa() == 24;
505     case CCode_break::TAa::e_not_set:
506     default:
507         return false;
508     }
509 }
510 
511 
512 // Determine whether a position in a CDS feature is the beginning
513 // of a selenocysteine codon (according to Code-break's)
s_IsSelenocysteine(TSeqPos pos_in_cds,CFeat_CI feat_iter,CScope & scope)514 static bool s_IsSelenocysteine(TSeqPos pos_in_cds, CFeat_CI feat_iter, CScope& scope)
515 {
516     const CSeq_feat& feat = feat_iter->GetOriginalFeature();
517     if (!feat.GetData().GetCdregion().IsSetCode_break()) {
518         return false;
519     }
520     ITERATE (CCdregion::TCode_break, code_break,
521              feat.GetData().GetCdregion().GetCode_break ()) {
522         if (CodeBreakPosInCds(**code_break, feat, scope) == pos_in_cds
523             && s_IsSelenocysteine(**code_break)) {
524             return true;
525         }
526     }
527     return false;
528 }
529 
530 
s_PrematureStopCodon(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)531 static void s_PrematureStopCodon(const CSeq_id& id, const CSeqTestContext* ctx,
532                                  CFeat_CI feat_iter, CSeq_test_result& result)
533 {
534     CConstRef<CGenetic_code> code =
535         s_GetCode(feat_iter->GetData().GetCdregion());
536     const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
537 
538     CSeqVector vec(feat_iter->GetLocation(), ctx->GetScope());
539     vec.SetIupacCoding();
540 
541     TSeqPos start_translating;
542     switch (feat_iter->GetData().GetCdregion().GetFrame()) {
543     case CCdregion::eFrame_not_set:
544     case CCdregion::eFrame_one:
545         start_translating = 0;
546         break;
547     case CCdregion::eFrame_two:
548         start_translating = 1;
549         break;
550     case CCdregion::eFrame_three:
551         start_translating = 2;
552         break;
553     default:
554         // should never happen, but handle it to avoid compiler warning
555         start_translating = kInvalidSeqPos;
556         break;
557     }
558 
559     bool premature_stop_found = false;
560     for (TSeqPos i = start_translating;  i < vec.size() - 3;  i += 3) {
561         if (tbl.IsOrfStop(tbl.SetCodonState(vec[i], vec[i + 1],
562                                             vec[i + 2]))) {
563             if (!premature_stop_found) {
564                 result.SetOutput_data()
565                     .AddField("has_premature_stop_codon", true);
566                 result.SetOutput_data()
567                     .AddField("first_premature_stop_position",
568                               static_cast<int>(i));
569                 premature_stop_found = true;
570             }
571             // determine whether it's an annotated selenocysteine
572             if (!s_IsSelenocysteine(i, feat_iter, ctx->GetScope())) {
573                 result.SetOutput_data()
574                     .AddField("has_premature_stop_codon_not_sec", true);
575                 result.SetOutput_data()
576                     .AddField("first_premature_stop_position_not_sec",
577                               static_cast<int>(i));
578                 return;
579             }
580         }
581     }
582 
583     result.SetOutput_data()
584         .AddField("has_premature_stop_codon_not_sec", false);
585     if (!premature_stop_found) {
586         result.SetOutput_data()
587             .AddField("has_premature_stop_codon", false);
588     }
589 }
590 
591 
592 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)593 CTestTranscript_PrematureStopCodon::RunTest(const CSerialObject& obj,
594                                             const CSeqTestContext* ctx)
595 {
596     return x_TestAllCdregions(obj, ctx, "premature_stop_codon",
597                               s_PrematureStopCodon);
598 }
599 
600 
601 // Walk the replace history to find the latest revision of a sequence
s_FindLatest(const CSeq_id & id,CScope & scope)602 static CConstRef<CSeq_id> s_FindLatest(const CSeq_id& id, CScope& scope)
603 {
604     CConstRef<CSeq_id> latest = sequence::FindLatestSequence(id, scope);
605     if ( !latest ) {
606         latest.Reset(&id);
607     }
608     return latest;
609 }
610 
611 
s_CompareProtProdToTrans(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)612 static void s_CompareProtProdToTrans(const CSeq_id& id,
613                                      const CSeqTestContext* ctx,
614                                      CFeat_CI feat_iter,
615                                      CSeq_test_result& result)
616 {
617     string translation;
618     CSeqTranslator::Translate(feat_iter->GetOriginalFeature(), ctx->GetScope(),
619                               translation, false /* include_stop */);
620     result.SetOutput_data().AddField("length_translation",
621                                      int(translation.size()));
622 
623     if (!feat_iter->GetOriginalFeature().CanGetProduct()) {
624         // can't do comparison if there's no product annotated
625         return;
626     }
627 
628     const CSeq_loc& prod_loc = feat_iter->GetOriginalFeature().GetProduct();
629     const CSeq_id& prod_id = sequence::GetId(prod_loc, 0);
630     CSeqVector prod_vec(prod_loc, ctx->GetScope());
631     prod_vec.SetIupacCoding();
632 
633     TSeqPos ident_count = 0;
634     for (TSeqPos i = 0;
635          i < min(prod_vec.size(), (TSeqPos)translation.size());  ++i) {
636         if (prod_vec[i] == translation[i]) {
637             ++ident_count;
638         }
639     }
640 
641     result.SetOutput_data().AddField("length_annotated_prot_prod",
642                                      int(prod_vec.size()));
643     result.SetOutput_data()
644         .AddField("fraction_identity",
645                   double(ident_count)
646                   / max(prod_vec.size(), (TSeqPos)translation.size()));
647 
648     CConstRef<CSeq_id> updated_id = s_FindLatest(prod_id, ctx->GetScope());
649     if (updated_id->Equals(prod_id)) {
650         result.SetOutput_data()
651             .AddField("fraction_identity_updated_prot_prod",
652                       double(ident_count)
653                       / max(prod_vec.size(), (TSeqPos)translation.size()));
654         result.SetOutput_data().AddField("length_updated_prot_prod",
655                                          int(prod_vec.size()));
656     } else {
657         CBioseq_Handle updated_prod_hand
658             = ctx->GetScope().GetBioseqHandle(*updated_id);
659         CSeqVector updated_prod_vec = updated_prod_hand.GetSeqVector();
660         updated_prod_vec.SetIupacCoding();
661         TSeqPos ident_count = 0;
662         for (TSeqPos i = 0;
663              i < min(updated_prod_vec.size(), (TSeqPos)translation.size());
664              ++i) {
665             if (updated_prod_vec[i] == translation[i]) {
666                 ++ident_count;
667             }
668         }
669         result.SetOutput_data()
670             .AddField("fraction_identity_updated_prot_prod",
671                       double(ident_count)
672                       / max(updated_prod_vec.size(),
673                             (TSeqPos)translation.size()));
674         result.SetOutput_data().AddField("length_updated_prot_prod",
675                                          int(updated_prod_vec.size()));
676     }
677     result.SetOutput_data()
678         .AddField("prot_prod_updated", !updated_id->Equals(prod_id));
679     result.SetOutput_data()
680         .AddField("updated_prod_id", updated_id->AsFastaString());
681 }
682 
683 
684 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)685 CTestTranscript_CompareProtProdToTrans::RunTest(const CSerialObject& obj,
686                                                 const CSeqTestContext* ctx)
687 {
688     return x_TestAllCdregions(obj, ctx, "compare_prot_prod_to_trans",
689                               s_CompareProtProdToTrans);
690 }
691 
692 
693 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)694 CTestTranscript_PolyA::RunTest(const CSerialObject& obj,
695                                           const CSeqTestContext* ctx)
696 {
697     CRef<CSeq_test_result_set> ref;
698     const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
699     if ( !id  ||  !ctx ) {
700         return ref;
701     }
702 
703     ref.Reset(new CSeq_test_result_set());
704 
705     CRef<CSeq_test_result> result = x_SkeletalTestResult("poly_a");
706     ref->Set().push_back(result);
707 
708     CBioseq_Handle xcript_hand = ctx->GetScope().GetBioseqHandle(*id);
709     CSeqVector vec = xcript_hand.GetSeqVector();
710     vec.SetIupacCoding();
711 
712     //compute trailing a-count
713     {{
714         int pos(0);
715         for(pos = vec.size() - 1;  pos > 0;  --pos) {
716             if (vec[pos] != 'A') {
717                 break;
718             }
719         }
720         result->SetOutput_data().AddField("trailing_a_count",
721                                           int(vec.size() - pos - 1));
722     }}
723 
724 
725     int tail_length(0);
726     //compute tail length allowing for mismatches.
727     //Note: there's similar logic for computing genomic polya priming in alignment tests
728     {{
729         static const int w_match = 1;
730         static const int w_mismatch = -4;
731         static const int x_dropoff = 15;
732 
733         size_t best_pos = NPOS;
734         int best_score = 0;
735         int curr_score = 0;
736 
737         for(size_t curr_pos = vec.size() - 1;
738             curr_pos > 0 && curr_score + x_dropoff > best_score;
739             --curr_pos)
740         {
741             curr_score += vec[curr_pos] == 'A' ? w_match : w_mismatch;
742             if(curr_score >= best_score) {
743                 best_score = curr_score;
744                 best_pos = curr_pos;
745             }
746         }
747         tail_length = (best_pos == NPOS) ? 0 : vec.size() - best_pos;
748         result->SetOutput_data().AddField("tail_length", tail_length);
749     }}
750 
751 
752     //find signal
753     {{
754         static string patterns[] = {
755             "AATAAA",
756             "ATTAAA",
757             "AGTAAA",
758             "TATAAA",
759             "CATAAA",
760             "GATAAA",
761             "AATATA",
762             "AATACA",
763             "AATAGA",
764             "ACTAAA",
765             "AAGAAA",
766             "AATGAA"
767         };
768 
769         size_t window = 50; //serch within 50 bases upstream of polya-site
770         size_t end_pos = vec.size() - 1 - tail_length;
771         size_t begin_pos = end_pos > window ? end_pos - window : 0;
772 
773         string seq;
774         vec.GetSeqData(begin_pos, end_pos, seq);
775 
776         for(int ii = 0; ii < 12; ii++) {
777             size_t pos = NStr::Find(seq, patterns[ii], NStr::eCase, NStr::eReverseSearch);
778             if(pos != NPOS) {
779                 result->SetOutput_data().AddField("signal_pos", static_cast<int>(pos + begin_pos));
780                 result->SetOutput_data().AddField("is_canonical_pas", (ii <= 1)); //AATAAA or ATTAAA
781                 break;
782             }
783         }
784     }}
785 
786     return ref;
787 }
788 
789 
TestStrongKozakUorfs(const CBioseq_Handle bsh,CSeq_test_result & result)790 void TestStrongKozakUorfs(const CBioseq_Handle bsh, CSeq_test_result& result)
791 {
792     TSeqPos cds_start(kInvalidSeqPos);
793     for(CFeat_CI ci(bsh, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) {
794         cds_start = ci->GetLocation().GetStart(eExtreme_Positional);
795         break;
796     }
797     if(cds_start == kInvalidSeqPos) {
798         return;
799     }
800 
801     COrf::TLocVec overlapping_uorfs, upstream_uorfs;
802     COrf::FindStrongKozakUOrfs(bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac), cds_start, overlapping_uorfs, upstream_uorfs);
803     result.SetOutput_data().AddField("overlapping_strong_uorfs", (int)overlapping_uorfs.size());
804     result.SetOutput_data().AddField("upstream_strong_uorfs", (int)upstream_uorfs.size());
805 }
806 
807 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)808 CTestTranscript_Orfs::RunTest(const CSerialObject& obj,
809                               const CSeqTestContext* ctx)
810 {
811     CRef<CSeq_test_result_set> ref;
812     const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
813     if ( !id  ||  !ctx ) {
814         return ref;
815     }
816 
817     ref.Reset(new CSeq_test_result_set());
818 
819     CRef<CSeq_test_result> result = x_SkeletalTestResult("orfs");
820     ref->Set().push_back(result);
821 
822     CBioseq_Handle xcript_hand = ctx->GetScope().GetBioseqHandle(*id);
823     CSeqVector vec = xcript_hand.GetSeqVector();
824     vec.SetIupacCoding();
825 
826     // Look for ORFs starting with any sense codon
827     vector<CRef<CSeq_loc> > orfs;
828     COrf::FindOrfs(vec, orfs);
829     TSeqPos max_orf_length_forward = 0;
830     TSeqPos max_orf_length_either = 0;
831     TSeqPos largest_forward_orf_end = 0;  // intialized to avoid comp. warning
832     ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
833         TSeqPos orf_length = sequence::GetLength(**orf, 0);
834         max_orf_length_either = max(max_orf_length_either, orf_length);
835         if ((*orf)->GetInt().GetStrand() != eNa_strand_minus) {
836             if (orf_length > max_orf_length_forward) {
837                 max_orf_length_forward = orf_length;
838                 largest_forward_orf_end = (*orf)->GetInt().GetTo();
839             }
840             max_orf_length_forward = max(max_orf_length_forward, orf_length);
841         }
842     }
843 
844     result->SetOutput_data().AddField("max_orf_length_forward_strand",
845                                       int(max_orf_length_forward));
846     result->SetOutput_data().AddField("largest_forward_orf_end_pos",
847                                       int(largest_forward_orf_end));
848     result->SetOutput_data().AddField("max_orf_length_either_strand",
849                                       int(max_orf_length_either));
850 
851     // Look for ORFs starting with ATG
852     orfs.clear();
853     vector<string> allowable_starts;
854     allowable_starts.push_back("ATG");
855     COrf::FindOrfs(vec, orfs, 3, 1, allowable_starts);
856     max_orf_length_forward = 0;
857     max_orf_length_either = 0;
858     ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
859         TSeqPos orf_length = sequence::GetLength(**orf, 0);
860         max_orf_length_either = max(max_orf_length_either, orf_length);
861         if ((*orf)->GetInt().GetStrand() != eNa_strand_minus) {
862             if (orf_length > max_orf_length_forward) {
863                 max_orf_length_forward = orf_length;
864                 largest_forward_orf_end = (*orf)->GetInt().GetTo();
865             }
866             max_orf_length_forward = max(max_orf_length_forward, orf_length);
867         }
868     }
869 
870     result->SetOutput_data().AddField("max_atg_orf_length_forward_strand",
871                                       int(max_orf_length_forward));
872     result->SetOutput_data().AddField("largest_forward_atg_orf_end_pos",
873                                       int(largest_forward_orf_end));
874     result->SetOutput_data().AddField("max_atg_orf_length_either_strand",
875                                       int(max_orf_length_either));
876 
877     TestStrongKozakUorfs(xcript_hand, *result);
878 
879     return ref;
880 }
881 
882 
s_Code_break(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)883 static void s_Code_break(const CSeq_id& id, const CSeqTestContext* ctx,
884                          CFeat_CI feat_iter, CSeq_test_result& result)
885 {
886     int count, not_start_not_sec_count;
887     if (feat_iter->GetData().GetCdregion().IsSetCode_break()) {
888         count = feat_iter->GetData().GetCdregion().GetCode_break().size();
889         not_start_not_sec_count = 0;
890         ITERATE (CCdregion::TCode_break, code_break,
891                  feat_iter->GetData().GetCdregion().GetCode_break()) {
892             TSeqPos pos = CodeBreakPosInCds(**code_break,
893                                             feat_iter->GetOriginalFeature(),
894                                             ctx->GetScope());
895             if (pos != 0 && !s_IsSelenocysteine(**code_break)) {
896                 ++not_start_not_sec_count;
897             }
898         }
899     } else {
900         count = 0;
901         not_start_not_sec_count = 0;
902     }
903 
904     result.SetOutput_data()
905         .AddField("code_break_count", count);
906     result.SetOutput_data()
907         .AddField("code_break_not_start_not_sec_count",
908                   not_start_not_sec_count);
909 }
910 
911 
912 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)913 CTestTranscript_Code_break::RunTest(const CSerialObject& obj,
914                                     const CSeqTestContext* ctx)
915 {
916     return x_TestAllCdregions(obj, ctx, "code_break",
917                               s_Code_break);
918 }
919 
920 
s_OrfExtension(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)921 static void s_OrfExtension(const CSeq_id& id,
922                            const CSeqTestContext* ctx,
923                            CFeat_CI feat_iter,
924                            CSeq_test_result& result)
925 {
926     TSeqPos upstream_length;
927     CSeqVector vec =
928         s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
929     vec.SetIupacCoding();
930 
931     EKozakStrength strength;
932     vector<int> starts(eStrong + 1, upstream_length);
933     string codon;
934     for (int i = upstream_length - 3;  i >= 0;  i -= 3) {
935         vec.GetSeqData(i, i + 3, codon);
936         if (codon == "ATG") {
937             strength = s_GetKozakStrength(vec, i);
938             starts[strength] = i;
939         }
940         if (codon == "TAA" || codon == "TAG" || codon == "TGA") {
941             break;
942         }
943     }
944 
945     // MSS-59
946     // Count the total number of 'ATG' triplets found in the 5'UTR of a
947     // transcript, in frames 1, 2, and 3.
948     int upstream_utr_atg_count(0);
949     for (int i = upstream_length - 3;  i >= 0;  i -= 1) {
950         vec.GetSeqData(i, i + 3, codon);
951         if (codon == "ATG") {
952             upstream_utr_atg_count++;
953         }
954     }
955 
956     result.SetOutput_data()
957         .AddField("max_extension_weak_kozak",
958                   static_cast<int>(upstream_length - starts[eWeak]));
959     result.SetOutput_data()
960         .AddField("max_extension_moderate_kozak",
961                   static_cast<int>(upstream_length - starts[eModerate]));
962     result.SetOutput_data()
963         .AddField("max_extension_strong_kozak",
964                   static_cast<int>(upstream_length - starts[eStrong]));
965     result.SetOutput_data()
966         .AddField("upstream_utr_atg_count",
967                   upstream_utr_atg_count);
968 }
969 
970 
971 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)972 CTestTranscript_OrfExtension::RunTest(const CSerialObject& obj,
973                                               const CSeqTestContext* ctx)
974 {
975     return x_TestAllCdregions(obj, ctx, "orf_extension",
976                               s_OrfExtension);
977 }
978 
979 
s_CountAmbiguities(const CSeqVector & vec)980 static TSeqPos s_CountAmbiguities(const CSeqVector& vec)
981 {
982     CSeqVector vec_copy(vec);
983     vec_copy.SetIupacCoding();
984     string seq;
985     vec_copy.GetSeqData(0, vec_copy.size(), seq);
986 
987     CSeq_data in_seq, out_seq;
988     in_seq.SetIupacna().Set(seq);
989     vector<TSeqPos> out_indices;
990 
991     return CSeqportUtil::GetAmbigs(in_seq, &out_seq, &out_indices);
992 }
993 
994 
s_CdsCountAmbiguities(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)995 static void s_CdsCountAmbiguities(const CSeq_id& id,
996                                   const CSeqTestContext* ctx,
997                                   CFeat_CI feat_iter, CSeq_test_result& result)
998 {
999     CSeqVector vec(feat_iter->GetLocation(), ctx->GetScope());
1000     result.SetOutput_data()
1001         .AddField("cds_ambiguity_count",
1002                   static_cast<int>(s_CountAmbiguities(vec)));
1003 }
1004 
1005 
1006 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)1007 CTestTranscript_CountAmbiguities::RunTest(const CSerialObject& obj,
1008                                           const CSeqTestContext* ctx)
1009 {
1010     CRef<CSeq_test_result_set> rv;
1011     const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
1012     if ( !id  ||  !ctx ) {
1013         return rv;
1014     }
1015 
1016     // count for each coding region
1017     rv = x_TestAllCdregions(obj, ctx, "count_ambiguities",
1018                             s_CdsCountAmbiguities);
1019 
1020     // count for entire transcript
1021     if (!rv) {
1022         rv.Reset(new CSeq_test_result_set());
1023     }
1024     CBioseq_Handle hand = ctx->GetScope().GetBioseqHandle(*id);
1025     CSeqVector vec = hand.GetSeqVector();
1026     CRef<CSeq_test_result> result = x_SkeletalTestResult("count_ambiguities");
1027     rv->Set().push_back(result);
1028     result->SetOutput_data()
1029         .AddField("ambiguity_count",
1030                   static_cast<int>(s_CountAmbiguities(vec)));
1031     return rv;
1032 }
1033 
1034 
1035 END_NCBI_SCOPE
1036