1 /* $Id: xcript_tests.cpp 545584 2017-09-07 17:37:11Z astashya $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio, Josh Cherry
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <algo/seqqa/xcript_tests.hpp>
34 #include <corelib/ncbitime.hpp>
35 #include <objects/general/Date.hpp>
36 #include <objects/general/Object_id.hpp>
37 #include <objects/general/User_object.hpp>
38 #include <objects/seq/Bioseq.hpp>
39 #include <objects/seq/MolInfo.hpp>
40 #include <objects/seq/Seq_descr.hpp>
41 #include <objects/seq/Seq_inst.hpp>
42 #include <objects/seq/Seqdesc.hpp>
43 #include <objects/seqtest/Seq_test_result.hpp>
44 #include <objects/seq/Seq_hist.hpp>
45 #include <objects/seq/Seq_hist_rec.hpp>
46 #include <objects/seq/seqport_util.hpp>
47 #include <serial/iterator.hpp>
48 #include <objects/seqfeat/Seq_feat.hpp>
49 #include <objects/seqfeat/SeqFeatData.hpp>
50 #include <objmgr/util/sequence.hpp>
51 #include <objmgr/seqdesc_ci.hpp>
52 #include <objmgr/feat_ci.hpp>
53 #include <objects/seqfeat/Cdregion.hpp>
54 #include <objects/seqfeat/Genetic_code.hpp>
55 #include <objects/seqfeat/Genetic_code_table.hpp>
56 #include <objects/seqfeat/Code_break.hpp>
57 #include <objmgr/seq_vector.hpp>
58 #include <algo/gnomon/gnomon.hpp>
59 #include <algo/sequence/orf.hpp>
60 #include <map>
61
62 BEGIN_NCBI_SCOPE
63 USING_SCOPE(objects);
64 USING_SCOPE(gnomon);
65
66
CanTest(const CSerialObject & obj,const CSeqTestContext * ctx) const67 bool CTestTranscript::CanTest(const CSerialObject& obj,
68 const CSeqTestContext* ctx) const
69 {
70 const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
71 if (id && ctx) {
72 CBioseq_Handle handle = ctx->GetScope().GetBioseqHandle(*id);
73 return handle.CanGetInst_Mol() && handle.GetInst_Mol() == CSeq_inst::eMol_rna;
74 }
75 return false;
76 }
77
78
79 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)80 CTestTranscript_CountCdregions::RunTest(const CSerialObject& obj,
81 const CSeqTestContext* ctx)
82 {
83 CRef<CSeq_test_result_set> ref;
84 const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
85 if ( !id || !ctx ) {
86 return ref;
87 }
88
89 ref.Reset(new CSeq_test_result_set());
90
91 CRef<CSeq_test_result> result = x_SkeletalTestResult("count_cdregions");
92 ref->Set().push_back(result);
93
94 SAnnotSelector sel;
95 sel.SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
96 sel.SetResolveDepth(0);
97 CSeq_loc loc;
98 loc.SetWhole().Assign(*id);
99 CFeat_CI feat_iter(ctx->GetScope(), loc, sel);
100
101 result->SetOutput_data()
102 .AddField("count", (int) feat_iter.GetSize());
103 return ref;
104 }
105
106
107 // A simplistic Kozak strength. Best is RNNXXXG, where
108 // XXX is the start codon. This is "strong". Having just
109 // R or just G is "moderate", and having neither is "weak".
110 enum EKozakStrength
111 {
112 eNone, // used to indicate that no start has been seen
113 eWeak,
114 eModerate,
115 eStrong
116 };
117
118
119 // vec must be set to IUPAC coding
s_GetKozakStrength(const CSeqVector & vec,TSeqPos pos)120 EKozakStrength s_GetKozakStrength(const CSeqVector& vec, TSeqPos pos)
121 {
122 int score = eWeak;
123 if (pos >= 3 &&
124 (vec[pos - 3] == 'A' || vec[pos - 3] == 'G')) {
125 ++score;
126 }
127 if (vec.size() > pos + 3 &&
128 vec[pos + 3] == 'G') {
129 ++score;
130 }
131 return EKozakStrength(score);
132 }
133
134
s_KozakStrengthToString(EKozakStrength strength)135 string s_KozakStrengthToString(EKozakStrength strength)
136 {
137 switch (strength) {
138 default:
139 case eNone:
140 return "none";
141
142 case eWeak:
143 return "weak";
144
145 case eModerate:
146 return "moderate";
147
148 case eStrong:
149 return "strong";
150 }
151 }
152
153
154 // Return a SeqVector describing the coding regioin, in the
155 // correct orientation, plus the upstream region.
156 // Put the length of the upstream region in upstream_length
s_GetCdregionPlusUpstream(CFeat_CI feat_iter,const CSeqTestContext * ctx,TSeqPos & upstream_length)157 static CSeqVector s_GetCdregionPlusUpstream(CFeat_CI feat_iter,
158 const CSeqTestContext* ctx,
159 TSeqPos& upstream_length)
160 {
161 CScope& scope = ctx->GetScope();
162 const CSeq_loc& first_cds_loc
163 = CSeq_loc_CI(feat_iter->GetLocation()).GetEmbeddingSeq_loc();
164 CRef<CSeq_loc> upstr(new CSeq_loc);
165 const CSeq_id& id = sequence::GetId(first_cds_loc, 0);
166 upstr->SetInt().SetId().Assign(id);
167 if (sequence::GetStrand(first_cds_loc) == eNa_strand_minus) {
168 upstr->SetInt().SetStrand(eNa_strand_minus);
169 upstr->SetInt().SetFrom(sequence::GetStop(first_cds_loc, 0) + 1);
170 upstr->SetInt().SetTo(sequence::GetLength(id, &scope) - 1);
171 } else {
172 upstr->SetInt().SetFrom(0);
173 upstr->SetInt().SetTo(sequence::GetStart(first_cds_loc, 0) - 1);
174 }
175 CSeq_loc loc;
176 loc.SetMix().AddSeqLoc(*upstr);
177 loc.SetMix().AddSeqLoc(feat_iter->GetLocation());
178 CSeqVector vec(loc, scope);
179 upstream_length = sequence::GetLength(*upstr, 0);
180 return vec;
181 }
182
183
184 // If set, return genetic code. Otherwise return
185 // "standard" genetic code.
s_GetCode(const CCdregion & cdr)186 CConstRef<CGenetic_code> s_GetCode(const CCdregion& cdr)
187 {
188 if (cdr.CanGetCode()) {
189 return CConstRef<CGenetic_code>(&cdr.GetCode());
190 }
191 CRef<CGenetic_code> standard(new CGenetic_code);
192 CRef<CGenetic_code::C_E> code_id(new CGenetic_code::C_E);
193 code_id->SetId(1);
194 standard->Set().push_back(code_id);
195 return standard;
196 }
197
198
s_CdsFlags(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)199 static void s_CdsFlags(const CSeq_id& id, const CSeqTestContext* ctx,
200 CFeat_CI feat_iter, CSeq_test_result& result)
201 {
202 result.SetOutput_data()
203 .AddField("is_partial",
204 feat_iter->IsSetPartial() && feat_iter->GetPartial());
205 result.SetOutput_data()
206 .AddField("is_pseudo",
207 feat_iter->IsSetPseudo() && feat_iter->GetPseudo());
208 result.SetOutput_data()
209 .AddField("is_except",
210 feat_iter->IsSetExcept() && feat_iter->GetExcept());
211 }
212
213
214 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)215 CTestTranscript_CdsFlags::RunTest(const CSerialObject& obj,
216 const CSeqTestContext* ctx)
217 {
218 return x_TestAllCdregions(obj, ctx, "cds_flags", s_CdsFlags);
219 }
220
221
s_InframeUpstreamStart(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)222 static void s_InframeUpstreamStart(const CSeq_id& id,
223 const CSeqTestContext* ctx,
224 CFeat_CI feat_iter,
225 CSeq_test_result& result)
226 {
227 TSeqPos upstream_length;
228 CSeqVector vec =
229 s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
230 vec.SetIupacCoding();
231
232 EKozakStrength strength;
233 EKozakStrength best_strength = eNone;
234 TSeqPos pos_nearest_best_start = 0; // initialize to avoid compiler warning
235 for (int i = upstream_length - 3; i >= 0; i -= 3) {
236 if (vec[i] == 'A' && vec[i + 1] == 'T' && vec[i + 2] == 'G') {
237 strength = s_GetKozakStrength(vec, i);
238 if (strength > best_strength) {
239 best_strength = strength;
240 pos_nearest_best_start = i;
241 }
242 }
243 }
244 result.SetOutput_data()
245 .AddField("inframe_upstream_start_exists", best_strength != eNone);
246 if (best_strength != eNone) {
247 result.SetOutput_data()
248 .AddField("inframe_upstream_start_best_kozak_strength",
249 s_KozakStrengthToString(best_strength));
250 result.SetOutput_data()
251 .AddField("nearest_best_upstream_start_distance",
252 int(upstream_length - pos_nearest_best_start - 3));
253 }
254 }
255
256
257 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)258 CTestTranscript_InframeUpstreamStart::RunTest(const CSerialObject& obj,
259 const CSeqTestContext* ctx)
260 {
261 return x_TestAllCdregions(obj, ctx, "inframe_upstream_start",
262 s_InframeUpstreamStart);
263 }
264
265
s_InframeUpstreamStop(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)266 static void s_InframeUpstreamStop(const CSeq_id& id,
267 const CSeqTestContext* ctx,
268 CFeat_CI feat_iter, CSeq_test_result& result)
269 {
270 CConstRef<CGenetic_code> code =
271 s_GetCode(feat_iter->GetData().GetCdregion());
272 const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
273
274 TSeqPos upstream_length;
275 CSeqVector vec =
276 s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
277 vec.SetIupacCoding();
278
279 for (int i = upstream_length - 3; i >= 0; i -= 3) {
280 if (tbl.IsOrfStop(tbl.SetCodonState(vec[i], vec[i + 1],
281 vec[i + 2]))) {
282 result.SetOutput_data()
283 .AddField("inframe_upstream_stop_exists",
284 true);
285 result.SetOutput_data()
286 .AddField("nearest_inframe_upstream_stop_distance",
287 int(upstream_length - i - 3));
288 return;
289 }
290 }
291 result.SetOutput_data()
292 .AddField("inframe_upstream_stop_exists",
293 false);
294 }
295
296
297 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)298 CTestTranscript_InframeUpstreamStop::RunTest(const CSerialObject& obj,
299 const CSeqTestContext* ctx)
300 {
301 return x_TestAllCdregions(obj, ctx, "inframe_upstream_stop",
302 s_InframeUpstreamStop);
303 }
304
305
s_CodingPropensity(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)306 static void s_CodingPropensity(const CSeq_id& id, const CSeqTestContext* ctx,
307 CFeat_CI feat_iter, CSeq_test_result& result)
308 {
309 //creating CHMMParameters object from file is expensive, so
310 //created objects are cached in a static map.
311 //It is reasonable to expect that the number of parameter files
312 //per program is small, and so the cache size does not need
313 //to be limited.
314
315 static std::map<string, CRef<CHMMParameters> > s_hmmparams_cache;
316 DEFINE_STATIC_FAST_MUTEX(map_mutex);
317
318 CRef<CHMMParameters> hmm_params;
319
320 if (!ctx->HasKey("gnomon_model_file")) {
321 return;
322 }
323
324
325 string model_file_name = (*ctx)["gnomon_model_file"];
326
327 {{
328 CFastMutexGuard guard(map_mutex); //released when goes out of scope 4 lines below
329 if(s_hmmparams_cache.find(model_file_name) == s_hmmparams_cache.end()) {
330 CNcbiIfstream model_file(model_file_name.c_str());
331 s_hmmparams_cache[model_file_name] = CRef<CHMMParameters>(new CHMMParameters(model_file));
332 }
333 }}
334
335 hmm_params = s_hmmparams_cache[model_file_name];
336
337
338 const CSeq_loc& cds = feat_iter->GetLocation();
339
340 int gccontent=0;
341 double score = CCodingPropensity::GetScore(hmm_params, cds,
342 ctx->GetScope(), &gccontent);
343
344 // Record results
345 result.SetOutput_data()
346 .AddField("model_file", model_file_name);
347 result.SetOutput_data()
348 .AddField("model_percent_gc", gccontent);
349 result.SetOutput_data()
350 .AddField("score", max(score, -1e100));
351
352 }
353
354
355 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)356 CTestTranscript_CodingPropensity::RunTest(const CSerialObject& obj,
357 const CSeqTestContext* ctx)
358 {
359 return x_TestAllCdregions(obj, ctx, "coding_propensity",
360 s_CodingPropensity);
361 }
362
363
364 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)365 CTestTranscript_TranscriptLength::RunTest(const CSerialObject& obj,
366 const CSeqTestContext* ctx)
367 {
368 CRef<CSeq_test_result_set> ref;
369 const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
370 if ( !id || !ctx ) {
371 return ref;
372 }
373
374 ref.Reset(new CSeq_test_result_set());
375
376 CRef<CSeq_test_result> result = x_SkeletalTestResult("transcript_length");
377 ref->Set().push_back(result);
378
379 int len = ctx->GetScope()
380 .GetBioseqHandle(dynamic_cast<const CSeq_id&>(obj)).GetInst_Length();
381 result->SetOutput_data()
382 .AddField("length", len);
383 return ref;
384 }
385
386
s_CdsLength(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)387 static void s_CdsLength(const CSeq_id& id, const CSeqTestContext* ctx,
388 CFeat_CI feat_iter, CSeq_test_result& result)
389 {
390 result.SetOutput_data()
391 .AddField("length",
392 (int)sequence::GetLength(feat_iter->GetLocation(), 0));
393 }
394
395
396 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)397 CTestTranscript_TranscriptCdsLength::RunTest(const CSerialObject& obj,
398 const CSeqTestContext* ctx)
399 {
400 return x_TestAllCdregions(obj, ctx, "cds_length", s_CdsLength);
401 }
402
403
s_Utrs(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)404 static void s_Utrs(const CSeq_id& id, const CSeqTestContext* ctx,
405 CFeat_CI feat_iter, CSeq_test_result& result)
406 {
407 const CSeq_loc& loc = feat_iter->GetLocation();
408 TSeqPos cds_from = sequence::GetStart(loc, 0);
409 TSeqPos cds_to = sequence::GetStop(loc, 0);
410 int xcript_len = ctx->GetScope().GetBioseqHandle(id).GetInst_Length();
411 result.SetOutput_data().AddField("length_5_prime_utr", (int) cds_from);
412 result.SetOutput_data().AddField("length_3_prime_utr",
413 (int) (xcript_len - cds_to - 1));
414 }
415
416
417 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)418 CTestTranscript_Utrs::RunTest(const CSerialObject& obj,
419 const CSeqTestContext* ctx)
420 {
421 return x_TestAllCdregions(obj, ctx, "utrs", s_Utrs);
422 }
423
424
s_CdsStartCodon(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)425 static void s_CdsStartCodon(const CSeq_id& id, const CSeqTestContext* ctx,
426 CFeat_CI feat_iter, CSeq_test_result& result)
427 {
428 CConstRef<CGenetic_code> code =
429 s_GetCode(feat_iter->GetData().GetCdregion());
430 const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
431
432 TSeqPos upstream_length;
433 CSeqVector vec =
434 s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
435 vec.SetIupacCoding();
436
437 string seq;
438 vec.GetSeqData(upstream_length, upstream_length + 3, seq);
439 // is this an officially sanctioned start?
440 result.SetOutput_data()
441 .AddField("is_start",
442 tbl.IsOrfStart(tbl.SetCodonState(seq[0], seq[1], seq[2])));
443 // record it, whatever it is
444 result.SetOutput_data()
445 .AddField("first_codon", seq);
446
447 result.SetOutput_data()
448 .AddField("kozak_strength",
449 s_KozakStrengthToString(s_GetKozakStrength(vec, upstream_length)));
450 }
451
452
453 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)454 CTestTranscript_CdsStartCodon::RunTest(const CSerialObject& obj,
455 const CSeqTestContext* ctx)
456 {
457 return x_TestAllCdregions(obj, ctx, "cds_start_codon", s_CdsStartCodon);
458 }
459
460
s_CdsStopCodon(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)461 static void s_CdsStopCodon(const CSeq_id& id, const CSeqTestContext* ctx,
462 CFeat_CI feat_iter, CSeq_test_result& result)
463 {
464 CConstRef<CGenetic_code> code =
465 s_GetCode(feat_iter->GetData().GetCdregion());
466 const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
467
468 CSeqVector vec(feat_iter->GetLocation(), ctx->GetScope());
469 vec.SetIupacCoding();
470 string seq;
471 vec.GetSeqData(vec.size() - 3, vec.size(), seq);
472 result.SetOutput_data()
473 .AddField("is_stop",
474 tbl.IsOrfStop(tbl.SetCodonState(seq[0], seq[1], seq[2])));
475 }
476
477
478 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)479 CTestTranscript_CdsStopCodon::RunTest(const CSerialObject& obj,
480 const CSeqTestContext* ctx)
481 {
482 return x_TestAllCdregions(obj, ctx, "cds_stop_codon", s_CdsStopCodon);
483 }
484
485
486 // Determine the position in a cds of the start of a Code-break
CodeBreakPosInCds(const CCode_break & code_break,const CSeq_feat & feat,CScope & scope)487 inline TSeqPos CodeBreakPosInCds(const CCode_break& code_break,
488 const CSeq_feat& feat, CScope& scope)
489 {
490 return sequence::LocationOffset(feat.GetLocation(), code_break.GetLoc(),
491 sequence::eOffset_FromStart, &scope);
492 }
493
494
495 // Determine whether a Code-break is a selenocysteine
s_IsSelenocysteine(const CCode_break & code_break)496 static bool s_IsSelenocysteine(const CCode_break& code_break)
497 {
498 switch (code_break.GetAa().Which()) {
499 case CCode_break::TAa::e_Ncbieaa:
500 return code_break.GetAa().GetNcbieaa() == 85;
501 case CCode_break::TAa::e_Ncbi8aa:
502 return code_break.GetAa().GetNcbi8aa() == 24;
503 case CCode_break::TAa::e_Ncbistdaa:
504 return code_break.GetAa().GetNcbistdaa() == 24;
505 case CCode_break::TAa::e_not_set:
506 default:
507 return false;
508 }
509 }
510
511
512 // Determine whether a position in a CDS feature is the beginning
513 // of a selenocysteine codon (according to Code-break's)
s_IsSelenocysteine(TSeqPos pos_in_cds,CFeat_CI feat_iter,CScope & scope)514 static bool s_IsSelenocysteine(TSeqPos pos_in_cds, CFeat_CI feat_iter, CScope& scope)
515 {
516 const CSeq_feat& feat = feat_iter->GetOriginalFeature();
517 if (!feat.GetData().GetCdregion().IsSetCode_break()) {
518 return false;
519 }
520 ITERATE (CCdregion::TCode_break, code_break,
521 feat.GetData().GetCdregion().GetCode_break ()) {
522 if (CodeBreakPosInCds(**code_break, feat, scope) == pos_in_cds
523 && s_IsSelenocysteine(**code_break)) {
524 return true;
525 }
526 }
527 return false;
528 }
529
530
s_PrematureStopCodon(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)531 static void s_PrematureStopCodon(const CSeq_id& id, const CSeqTestContext* ctx,
532 CFeat_CI feat_iter, CSeq_test_result& result)
533 {
534 CConstRef<CGenetic_code> code =
535 s_GetCode(feat_iter->GetData().GetCdregion());
536 const CTrans_table& tbl = CGen_code_table::GetTransTable(*code);
537
538 CSeqVector vec(feat_iter->GetLocation(), ctx->GetScope());
539 vec.SetIupacCoding();
540
541 TSeqPos start_translating;
542 switch (feat_iter->GetData().GetCdregion().GetFrame()) {
543 case CCdregion::eFrame_not_set:
544 case CCdregion::eFrame_one:
545 start_translating = 0;
546 break;
547 case CCdregion::eFrame_two:
548 start_translating = 1;
549 break;
550 case CCdregion::eFrame_three:
551 start_translating = 2;
552 break;
553 default:
554 // should never happen, but handle it to avoid compiler warning
555 start_translating = kInvalidSeqPos;
556 break;
557 }
558
559 bool premature_stop_found = false;
560 for (TSeqPos i = start_translating; i < vec.size() - 3; i += 3) {
561 if (tbl.IsOrfStop(tbl.SetCodonState(vec[i], vec[i + 1],
562 vec[i + 2]))) {
563 if (!premature_stop_found) {
564 result.SetOutput_data()
565 .AddField("has_premature_stop_codon", true);
566 result.SetOutput_data()
567 .AddField("first_premature_stop_position",
568 static_cast<int>(i));
569 premature_stop_found = true;
570 }
571 // determine whether it's an annotated selenocysteine
572 if (!s_IsSelenocysteine(i, feat_iter, ctx->GetScope())) {
573 result.SetOutput_data()
574 .AddField("has_premature_stop_codon_not_sec", true);
575 result.SetOutput_data()
576 .AddField("first_premature_stop_position_not_sec",
577 static_cast<int>(i));
578 return;
579 }
580 }
581 }
582
583 result.SetOutput_data()
584 .AddField("has_premature_stop_codon_not_sec", false);
585 if (!premature_stop_found) {
586 result.SetOutput_data()
587 .AddField("has_premature_stop_codon", false);
588 }
589 }
590
591
592 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)593 CTestTranscript_PrematureStopCodon::RunTest(const CSerialObject& obj,
594 const CSeqTestContext* ctx)
595 {
596 return x_TestAllCdregions(obj, ctx, "premature_stop_codon",
597 s_PrematureStopCodon);
598 }
599
600
601 // Walk the replace history to find the latest revision of a sequence
s_FindLatest(const CSeq_id & id,CScope & scope)602 static CConstRef<CSeq_id> s_FindLatest(const CSeq_id& id, CScope& scope)
603 {
604 CConstRef<CSeq_id> latest = sequence::FindLatestSequence(id, scope);
605 if ( !latest ) {
606 latest.Reset(&id);
607 }
608 return latest;
609 }
610
611
s_CompareProtProdToTrans(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)612 static void s_CompareProtProdToTrans(const CSeq_id& id,
613 const CSeqTestContext* ctx,
614 CFeat_CI feat_iter,
615 CSeq_test_result& result)
616 {
617 string translation;
618 CSeqTranslator::Translate(feat_iter->GetOriginalFeature(), ctx->GetScope(),
619 translation, false /* include_stop */);
620 result.SetOutput_data().AddField("length_translation",
621 int(translation.size()));
622
623 if (!feat_iter->GetOriginalFeature().CanGetProduct()) {
624 // can't do comparison if there's no product annotated
625 return;
626 }
627
628 const CSeq_loc& prod_loc = feat_iter->GetOriginalFeature().GetProduct();
629 const CSeq_id& prod_id = sequence::GetId(prod_loc, 0);
630 CSeqVector prod_vec(prod_loc, ctx->GetScope());
631 prod_vec.SetIupacCoding();
632
633 TSeqPos ident_count = 0;
634 for (TSeqPos i = 0;
635 i < min(prod_vec.size(), (TSeqPos)translation.size()); ++i) {
636 if (prod_vec[i] == translation[i]) {
637 ++ident_count;
638 }
639 }
640
641 result.SetOutput_data().AddField("length_annotated_prot_prod",
642 int(prod_vec.size()));
643 result.SetOutput_data()
644 .AddField("fraction_identity",
645 double(ident_count)
646 / max(prod_vec.size(), (TSeqPos)translation.size()));
647
648 CConstRef<CSeq_id> updated_id = s_FindLatest(prod_id, ctx->GetScope());
649 if (updated_id->Equals(prod_id)) {
650 result.SetOutput_data()
651 .AddField("fraction_identity_updated_prot_prod",
652 double(ident_count)
653 / max(prod_vec.size(), (TSeqPos)translation.size()));
654 result.SetOutput_data().AddField("length_updated_prot_prod",
655 int(prod_vec.size()));
656 } else {
657 CBioseq_Handle updated_prod_hand
658 = ctx->GetScope().GetBioseqHandle(*updated_id);
659 CSeqVector updated_prod_vec = updated_prod_hand.GetSeqVector();
660 updated_prod_vec.SetIupacCoding();
661 TSeqPos ident_count = 0;
662 for (TSeqPos i = 0;
663 i < min(updated_prod_vec.size(), (TSeqPos)translation.size());
664 ++i) {
665 if (updated_prod_vec[i] == translation[i]) {
666 ++ident_count;
667 }
668 }
669 result.SetOutput_data()
670 .AddField("fraction_identity_updated_prot_prod",
671 double(ident_count)
672 / max(updated_prod_vec.size(),
673 (TSeqPos)translation.size()));
674 result.SetOutput_data().AddField("length_updated_prot_prod",
675 int(updated_prod_vec.size()));
676 }
677 result.SetOutput_data()
678 .AddField("prot_prod_updated", !updated_id->Equals(prod_id));
679 result.SetOutput_data()
680 .AddField("updated_prod_id", updated_id->AsFastaString());
681 }
682
683
684 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)685 CTestTranscript_CompareProtProdToTrans::RunTest(const CSerialObject& obj,
686 const CSeqTestContext* ctx)
687 {
688 return x_TestAllCdregions(obj, ctx, "compare_prot_prod_to_trans",
689 s_CompareProtProdToTrans);
690 }
691
692
693 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)694 CTestTranscript_PolyA::RunTest(const CSerialObject& obj,
695 const CSeqTestContext* ctx)
696 {
697 CRef<CSeq_test_result_set> ref;
698 const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
699 if ( !id || !ctx ) {
700 return ref;
701 }
702
703 ref.Reset(new CSeq_test_result_set());
704
705 CRef<CSeq_test_result> result = x_SkeletalTestResult("poly_a");
706 ref->Set().push_back(result);
707
708 CBioseq_Handle xcript_hand = ctx->GetScope().GetBioseqHandle(*id);
709 CSeqVector vec = xcript_hand.GetSeqVector();
710 vec.SetIupacCoding();
711
712 //compute trailing a-count
713 {{
714 int pos(0);
715 for(pos = vec.size() - 1; pos > 0; --pos) {
716 if (vec[pos] != 'A') {
717 break;
718 }
719 }
720 result->SetOutput_data().AddField("trailing_a_count",
721 int(vec.size() - pos - 1));
722 }}
723
724
725 int tail_length(0);
726 //compute tail length allowing for mismatches.
727 //Note: there's similar logic for computing genomic polya priming in alignment tests
728 {{
729 static const int w_match = 1;
730 static const int w_mismatch = -4;
731 static const int x_dropoff = 15;
732
733 size_t best_pos = NPOS;
734 int best_score = 0;
735 int curr_score = 0;
736
737 for(size_t curr_pos = vec.size() - 1;
738 curr_pos > 0 && curr_score + x_dropoff > best_score;
739 --curr_pos)
740 {
741 curr_score += vec[curr_pos] == 'A' ? w_match : w_mismatch;
742 if(curr_score >= best_score) {
743 best_score = curr_score;
744 best_pos = curr_pos;
745 }
746 }
747 tail_length = (best_pos == NPOS) ? 0 : vec.size() - best_pos;
748 result->SetOutput_data().AddField("tail_length", tail_length);
749 }}
750
751
752 //find signal
753 {{
754 static string patterns[] = {
755 "AATAAA",
756 "ATTAAA",
757 "AGTAAA",
758 "TATAAA",
759 "CATAAA",
760 "GATAAA",
761 "AATATA",
762 "AATACA",
763 "AATAGA",
764 "ACTAAA",
765 "AAGAAA",
766 "AATGAA"
767 };
768
769 size_t window = 50; //serch within 50 bases upstream of polya-site
770 size_t end_pos = vec.size() - 1 - tail_length;
771 size_t begin_pos = end_pos > window ? end_pos - window : 0;
772
773 string seq;
774 vec.GetSeqData(begin_pos, end_pos, seq);
775
776 for(int ii = 0; ii < 12; ii++) {
777 size_t pos = NStr::Find(seq, patterns[ii], NStr::eCase, NStr::eReverseSearch);
778 if(pos != NPOS) {
779 result->SetOutput_data().AddField("signal_pos", static_cast<int>(pos + begin_pos));
780 result->SetOutput_data().AddField("is_canonical_pas", (ii <= 1)); //AATAAA or ATTAAA
781 break;
782 }
783 }
784 }}
785
786 return ref;
787 }
788
789
TestStrongKozakUorfs(const CBioseq_Handle bsh,CSeq_test_result & result)790 void TestStrongKozakUorfs(const CBioseq_Handle bsh, CSeq_test_result& result)
791 {
792 TSeqPos cds_start(kInvalidSeqPos);
793 for(CFeat_CI ci(bsh, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) {
794 cds_start = ci->GetLocation().GetStart(eExtreme_Positional);
795 break;
796 }
797 if(cds_start == kInvalidSeqPos) {
798 return;
799 }
800
801 COrf::TLocVec overlapping_uorfs, upstream_uorfs;
802 COrf::FindStrongKozakUOrfs(bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac), cds_start, overlapping_uorfs, upstream_uorfs);
803 result.SetOutput_data().AddField("overlapping_strong_uorfs", (int)overlapping_uorfs.size());
804 result.SetOutput_data().AddField("upstream_strong_uorfs", (int)upstream_uorfs.size());
805 }
806
807 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)808 CTestTranscript_Orfs::RunTest(const CSerialObject& obj,
809 const CSeqTestContext* ctx)
810 {
811 CRef<CSeq_test_result_set> ref;
812 const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
813 if ( !id || !ctx ) {
814 return ref;
815 }
816
817 ref.Reset(new CSeq_test_result_set());
818
819 CRef<CSeq_test_result> result = x_SkeletalTestResult("orfs");
820 ref->Set().push_back(result);
821
822 CBioseq_Handle xcript_hand = ctx->GetScope().GetBioseqHandle(*id);
823 CSeqVector vec = xcript_hand.GetSeqVector();
824 vec.SetIupacCoding();
825
826 // Look for ORFs starting with any sense codon
827 vector<CRef<CSeq_loc> > orfs;
828 COrf::FindOrfs(vec, orfs);
829 TSeqPos max_orf_length_forward = 0;
830 TSeqPos max_orf_length_either = 0;
831 TSeqPos largest_forward_orf_end = 0; // intialized to avoid comp. warning
832 ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
833 TSeqPos orf_length = sequence::GetLength(**orf, 0);
834 max_orf_length_either = max(max_orf_length_either, orf_length);
835 if ((*orf)->GetInt().GetStrand() != eNa_strand_minus) {
836 if (orf_length > max_orf_length_forward) {
837 max_orf_length_forward = orf_length;
838 largest_forward_orf_end = (*orf)->GetInt().GetTo();
839 }
840 max_orf_length_forward = max(max_orf_length_forward, orf_length);
841 }
842 }
843
844 result->SetOutput_data().AddField("max_orf_length_forward_strand",
845 int(max_orf_length_forward));
846 result->SetOutput_data().AddField("largest_forward_orf_end_pos",
847 int(largest_forward_orf_end));
848 result->SetOutput_data().AddField("max_orf_length_either_strand",
849 int(max_orf_length_either));
850
851 // Look for ORFs starting with ATG
852 orfs.clear();
853 vector<string> allowable_starts;
854 allowable_starts.push_back("ATG");
855 COrf::FindOrfs(vec, orfs, 3, 1, allowable_starts);
856 max_orf_length_forward = 0;
857 max_orf_length_either = 0;
858 ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
859 TSeqPos orf_length = sequence::GetLength(**orf, 0);
860 max_orf_length_either = max(max_orf_length_either, orf_length);
861 if ((*orf)->GetInt().GetStrand() != eNa_strand_minus) {
862 if (orf_length > max_orf_length_forward) {
863 max_orf_length_forward = orf_length;
864 largest_forward_orf_end = (*orf)->GetInt().GetTo();
865 }
866 max_orf_length_forward = max(max_orf_length_forward, orf_length);
867 }
868 }
869
870 result->SetOutput_data().AddField("max_atg_orf_length_forward_strand",
871 int(max_orf_length_forward));
872 result->SetOutput_data().AddField("largest_forward_atg_orf_end_pos",
873 int(largest_forward_orf_end));
874 result->SetOutput_data().AddField("max_atg_orf_length_either_strand",
875 int(max_orf_length_either));
876
877 TestStrongKozakUorfs(xcript_hand, *result);
878
879 return ref;
880 }
881
882
s_Code_break(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)883 static void s_Code_break(const CSeq_id& id, const CSeqTestContext* ctx,
884 CFeat_CI feat_iter, CSeq_test_result& result)
885 {
886 int count, not_start_not_sec_count;
887 if (feat_iter->GetData().GetCdregion().IsSetCode_break()) {
888 count = feat_iter->GetData().GetCdregion().GetCode_break().size();
889 not_start_not_sec_count = 0;
890 ITERATE (CCdregion::TCode_break, code_break,
891 feat_iter->GetData().GetCdregion().GetCode_break()) {
892 TSeqPos pos = CodeBreakPosInCds(**code_break,
893 feat_iter->GetOriginalFeature(),
894 ctx->GetScope());
895 if (pos != 0 && !s_IsSelenocysteine(**code_break)) {
896 ++not_start_not_sec_count;
897 }
898 }
899 } else {
900 count = 0;
901 not_start_not_sec_count = 0;
902 }
903
904 result.SetOutput_data()
905 .AddField("code_break_count", count);
906 result.SetOutput_data()
907 .AddField("code_break_not_start_not_sec_count",
908 not_start_not_sec_count);
909 }
910
911
912 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)913 CTestTranscript_Code_break::RunTest(const CSerialObject& obj,
914 const CSeqTestContext* ctx)
915 {
916 return x_TestAllCdregions(obj, ctx, "code_break",
917 s_Code_break);
918 }
919
920
s_OrfExtension(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)921 static void s_OrfExtension(const CSeq_id& id,
922 const CSeqTestContext* ctx,
923 CFeat_CI feat_iter,
924 CSeq_test_result& result)
925 {
926 TSeqPos upstream_length;
927 CSeqVector vec =
928 s_GetCdregionPlusUpstream(feat_iter, ctx, upstream_length);
929 vec.SetIupacCoding();
930
931 EKozakStrength strength;
932 vector<int> starts(eStrong + 1, upstream_length);
933 string codon;
934 for (int i = upstream_length - 3; i >= 0; i -= 3) {
935 vec.GetSeqData(i, i + 3, codon);
936 if (codon == "ATG") {
937 strength = s_GetKozakStrength(vec, i);
938 starts[strength] = i;
939 }
940 if (codon == "TAA" || codon == "TAG" || codon == "TGA") {
941 break;
942 }
943 }
944
945 // MSS-59
946 // Count the total number of 'ATG' triplets found in the 5'UTR of a
947 // transcript, in frames 1, 2, and 3.
948 int upstream_utr_atg_count(0);
949 for (int i = upstream_length - 3; i >= 0; i -= 1) {
950 vec.GetSeqData(i, i + 3, codon);
951 if (codon == "ATG") {
952 upstream_utr_atg_count++;
953 }
954 }
955
956 result.SetOutput_data()
957 .AddField("max_extension_weak_kozak",
958 static_cast<int>(upstream_length - starts[eWeak]));
959 result.SetOutput_data()
960 .AddField("max_extension_moderate_kozak",
961 static_cast<int>(upstream_length - starts[eModerate]));
962 result.SetOutput_data()
963 .AddField("max_extension_strong_kozak",
964 static_cast<int>(upstream_length - starts[eStrong]));
965 result.SetOutput_data()
966 .AddField("upstream_utr_atg_count",
967 upstream_utr_atg_count);
968 }
969
970
971 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)972 CTestTranscript_OrfExtension::RunTest(const CSerialObject& obj,
973 const CSeqTestContext* ctx)
974 {
975 return x_TestAllCdregions(obj, ctx, "orf_extension",
976 s_OrfExtension);
977 }
978
979
s_CountAmbiguities(const CSeqVector & vec)980 static TSeqPos s_CountAmbiguities(const CSeqVector& vec)
981 {
982 CSeqVector vec_copy(vec);
983 vec_copy.SetIupacCoding();
984 string seq;
985 vec_copy.GetSeqData(0, vec_copy.size(), seq);
986
987 CSeq_data in_seq, out_seq;
988 in_seq.SetIupacna().Set(seq);
989 vector<TSeqPos> out_indices;
990
991 return CSeqportUtil::GetAmbigs(in_seq, &out_seq, &out_indices);
992 }
993
994
s_CdsCountAmbiguities(const CSeq_id & id,const CSeqTestContext * ctx,CFeat_CI feat_iter,CSeq_test_result & result)995 static void s_CdsCountAmbiguities(const CSeq_id& id,
996 const CSeqTestContext* ctx,
997 CFeat_CI feat_iter, CSeq_test_result& result)
998 {
999 CSeqVector vec(feat_iter->GetLocation(), ctx->GetScope());
1000 result.SetOutput_data()
1001 .AddField("cds_ambiguity_count",
1002 static_cast<int>(s_CountAmbiguities(vec)));
1003 }
1004
1005
1006 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)1007 CTestTranscript_CountAmbiguities::RunTest(const CSerialObject& obj,
1008 const CSeqTestContext* ctx)
1009 {
1010 CRef<CSeq_test_result_set> rv;
1011 const CSeq_id* id = dynamic_cast<const CSeq_id*>(&obj);
1012 if ( !id || !ctx ) {
1013 return rv;
1014 }
1015
1016 // count for each coding region
1017 rv = x_TestAllCdregions(obj, ctx, "count_ambiguities",
1018 s_CdsCountAmbiguities);
1019
1020 // count for entire transcript
1021 if (!rv) {
1022 rv.Reset(new CSeq_test_result_set());
1023 }
1024 CBioseq_Handle hand = ctx->GetScope().GetBioseqHandle(*id);
1025 CSeqVector vec = hand.GetSeqVector();
1026 CRef<CSeq_test_result> result = x_SkeletalTestResult("count_ambiguities");
1027 rv->Set().push_back(result);
1028 result->SetOutput_data()
1029 .AddField("ambiguity_count",
1030 static_cast<int>(s_CountAmbiguities(vec)));
1031 return rv;
1032 }
1033
1034
1035 END_NCBI_SCOPE
1036