1 /* $Id: single_aln_tests.cpp 353529 2012-02-16 18:22:04Z astashya $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Josh Cherry
27 *
28 * File Description: Tests operating on an alignment of a transcript to
29 * genomic DNA
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <algo/seqqa/single_aln_tests.hpp>
35 #include <objects/seqalign/Seq_align.hpp>
36 #include <objects/seqloc/Seq_interval.hpp>
37 #include <objects/seq/Seq_data.hpp>
38 #include <objects/seq/IUPACna.hpp>
39 #include <objtools/alnmgr/alnvec.hpp>
40 #include <objects/seqalign/Seq_align_set.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objects/seqalign/Seq_align.hpp>
43 #include <objects/seqloc/Seq_loc.hpp>
44 #include <objects/seqalign/Dense_seg.hpp>
45 #include <objects/seqalign/Spliced_exon.hpp>
46 #include <objects/seqalign/Spliced_seg.hpp>
47 #include <objects/seqalign/Spliced_exon_chunk.hpp>
48 #include <objects/seqalign/Product_pos.hpp>
49 #include <objects/seqfeat/Seq_feat.hpp>
50 #include <objects/seqfeat/Cdregion.hpp>
51 #include <objects/seqfeat/Code_break.hpp>
52 #include <objects/general/Object_id.hpp>
53 #include <objects/general/User_object.hpp>
54 #include <objects/seq/seqport_util.hpp>
55 #include <objmgr/seq_vector.hpp>
56 #include <objmgr/bioseq_handle.hpp>
57 #include <objmgr/annot_selector.hpp>
58 #include <objmgr/feat_ci.hpp>
59 #include <objmgr/util/sequence.hpp>
60 #include <objmgr/seq_loc_mapper.hpp>
61 #include <algo/sequence/consensus_splice.hpp>
62
63 BEGIN_NCBI_SCOPE
64 USING_SCOPE(objects);
65
CanTest(const CSerialObject & obj,const CSeqTestContext * ctx) const66 bool CTestSingleAln::CanTest(const CSerialObject& obj,
67 const CSeqTestContext* ctx) const
68 {
69 const CSeq_align* aln = dynamic_cast<const CSeq_align*>(&obj);
70 if (aln) {
71 // Can analyze a disc alignment...
72 if (aln->GetType() == ncbi::CSeq_align::eType_disc) {
73 return true;
74 }
75 // ...or a Spliced-seg where product is a transcript
76 if (aln->GetSegs().IsSpliced()) {
77 if (aln->GetSegs().GetSpliced().GetProduct_type()
78 == CSpliced_seg::eProduct_type_transcript) {
79 return true;
80 }
81 }
82 }
83 return false;
84 }
85
86
87 // Like CSeq_align::GetSeqStrand, but if the strand is not set,
88 // returns eNa_strand_plus rather than throwing an exception
s_GetSeqStrand(const CSeq_align & aln,CSeq_align::TDim row)89 static ENa_strand s_GetSeqStrand(const CSeq_align& aln, CSeq_align::TDim row)
90 {
91 try {
92 return aln.GetSeqStrand(row);
93 } catch(CSeqalignException&) {
94 return eNa_strand_plus;
95 }
96 }
97
98
99 static CRef<CSeq_align> s_SplicedToDisc(const CSeq_align& spliced_seg_aln);
100
101
102 //if neighborhood < 0 - search upstream of cleavage; otherwise downstream.
s_GetPolyA_genomic_priming(const CSeq_align & aln,CScope & scope,int neighborhood)103 static size_t s_GetPolyA_genomic_priming(const CSeq_align& aln, CScope& scope, int neighborhood)
104 {
105 CBioseq_Handle bsh = scope.GetBioseqHandle(aln.GetSeq_id(1));
106
107 CRef<CSeq_loc> loc(new CSeq_loc); //upstream or downstream neighborhood loc
108 {{
109 ENa_strand strand = s_GetSeqStrand(aln, 1);
110
111 //cleavage position
112 TSeqPos pos = strand == eNa_strand_minus ? aln.GetSeqStart(1) : aln.GetSeqStop(1);
113
114 //if searching downstream of cleavage, start with first pos past end of alignment
115 if(neighborhood > 0 && pos > 0 && pos < bsh.GetInst_Length() - 1) {
116 pos += strand == eNa_strand_minus ? -1 : +1;
117 }
118
119 loc->SetInt().SetId().Assign(aln.GetSeq_id(1));
120 loc->SetInt().SetFrom(pos);
121 loc->SetInt().SetTo(pos);
122 loc->SetInt().SetStrand(strand);
123
124 if( (neighborhood >= 0 && strand != eNa_strand_minus)
125 || (neighborhood < 0 && strand == eNa_strand_minus))
126 {
127 loc->SetInt().SetTo(min(bsh.GetInst_Length() - 1, pos + abs(neighborhood)));
128 } else {
129 loc->SetInt().SetFrom(pos < abs(neighborhood) ? 0 : pos - abs(neighborhood));
130 }
131 }}
132
133 CSeqVector vec(*loc, scope, CBioseq_Handle::eCoding_Iupac);
134 string seq("");
135
136 vec.GetSeqData(vec.begin(), vec.end(), seq);
137 if(neighborhood < 0) {
138 reverse(seq.begin(), seq.end());
139 }
140
141 size_t best_pos(NPOS);
142 {{
143 static const int w_match = 1;
144 static const int w_mismatch = -4;
145 static const int x_dropoff = 15;
146
147 int best_score = 0;
148 int curr_score = 0;
149
150 for(size_t curr_pos = 0;
151 curr_pos < seq.size() && curr_score + x_dropoff > best_score;
152 ++curr_pos)
153 {
154 curr_score += seq[curr_pos] == 'A' ? w_match : w_mismatch;
155 if(curr_score >= best_score) {
156 best_score = curr_score;
157 best_pos = curr_pos;
158 }
159 }
160 }}
161 size_t priming_length = best_pos == NPOS ? 0 : best_pos + 1;
162
163 //string label;
164 //loc->GetLabel(&label);
165 //LOG_POST(aln.GetSeq_id(0).AsFastaString() << "\t" << label << "\t" << neighborhood << "\t" << seq << "\t" << priming_length);
166
167 return priming_length;
168 }
169
170
171
172 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)173 CTestSingleAln_All::RunTest(const CSerialObject& obj,
174 const CSeqTestContext* ctx)
175 {
176 CRef<CSeq_test_result_set> ref;
177 CConstRef<CSeq_align> aln(dynamic_cast<const CSeq_align*>(&obj));
178 if ( !aln || !ctx) {
179 return ref;
180 }
181
182 // Handle a Spliced-seg by converting it to a disc
183 if (aln->GetSegs().IsSpliced()) {
184 aln = s_SplicedToDisc(*aln);
185 } else if(!aln->GetSegs().IsDisc()) {
186 NCBI_THROW(CException, eUnknown, "Expected spliced or disc alignment");
187 }
188
189 ref.Reset(new CSeq_test_result_set());
190
191 CRef<CSeq_test_result> result = x_SkeletalTestResult("singlealn_all");
192 ref->Set().push_back(result);
193
194 CScope& scope = ctx->GetScope();
195
196 const CSeq_align_set::Tdata& disc = aln->GetSegs().GetDisc().Get();
197 result->SetOutput_data()
198 .AddField("exon_count", (int) disc.size());
199
200 CBioseq_Handle xcript_hand
201 = scope.GetBioseqHandle(disc.front()->GetSeq_id(0));
202 SAnnotSelector sel(CSeqFeatData::eSubtype_cdregion);
203 sel.SetResolveDepth(0);
204 CFeat_CI it(xcript_hand, sel);
205 bool has_cds(it);
206 TSeqPos cds_from = 0, cds_to = 0; // initialize to avoid compiler warning
207 CAlnVec::TSignedRange cds_range;
208 const CSeq_id& genomic_id = disc.front()->GetSeq_id(1);
209 CBioseq_Handle genomic_hand = scope.GetBioseqHandle(genomic_id);
210 if (has_cds) {
211 const CSeq_loc& cds_loc = it->GetLocation();
212 cds_from = sequence::GetStart(cds_loc, 0);
213 cds_to = sequence::GetStop(cds_loc, 0);
214 cds_range.SetFrom(static_cast<TSignedSeqPos>(cds_from));
215 cds_range.SetTo(static_cast<TSignedSeqPos>(cds_to));
216
217 // Assess whether differences between transcript and genome
218 // would affect the protein produced
219 CSeq_loc_Mapper mapper(*aln, 1, &scope);
220 CRef<CSeq_loc> genomic_cds_loc = mapper.Map(cds_loc);
221 const CGenetic_code* code = 0;
222 if (it->GetData().GetCdregion().CanGetCode()) {
223 code = &it->GetData().GetCdregion().GetCode();
224 }
225 string xcript_prot, genomic_prot;
226 CSeqTranslator::Translate(cds_loc, scope, xcript_prot, code);
227 CSeqTranslator::Translate(*genomic_cds_loc, scope, genomic_prot, code);
228 // Say that they "can make same prot" iff the translations
229 // are the same AND do not contain ambiguities
230 bool can_make_same_prot = false;
231 if (xcript_prot == genomic_prot &&
232 xcript_prot.find_first_not_of("ACDEFGHIKLMNPQRSTVWY*") == NPOS) {
233 // no need to check genomic (it's equal)
234
235 can_make_same_prot = true; // provisionally true, but...
236
237 // Demand that any code-breaks annotated on transcript
238 // are identical at the nucleotide level in the genomic
239 if (it->GetData().GetCdregion().IsSetCode_break()) {
240 ITERATE (CCdregion::TCode_break, cb,
241 it->GetMappedFeature().GetData().GetCdregion()
242 .GetCode_break()) {
243 const CCode_break& code_break = **cb;
244 const CSeq_loc& xcript_cb_loc = code_break.GetLoc();
245 CRef<CSeq_loc> genomic_cb_loc = mapper.Map(xcript_cb_loc);
246
247 CSeqVector gvec(*genomic_cb_loc, scope);
248 string gseq;
249 gvec.GetSeqData(0, gvec.size(), gseq);
250
251 CSeqVector xvec(xcript_cb_loc, scope);
252 string xseq;
253 xvec.GetSeqData(0, xvec.size(), xseq);
254
255 if (gseq != xseq) {
256 can_make_same_prot = false;
257 break;
258 }
259 }
260 }
261 }
262 result->SetOutput_data()
263 .AddField("can_make_same_prot", can_make_same_prot);
264 }
265
266
267 TSeqPos last_genomic_end = 0;
268 TSeqPos aligned_residue_count = 0;
269 TSeqPos cds_aligned_residue_count = 0;
270 TSeqPos match_count = 0, possible_match_count = 0;
271 TSeqPos cds_match_count = 0, cds_possible_match_count = 0;
272 vector<int> cds_match_count_by_frame(3);
273 vector<int> cds_possible_match_count_by_frame(3);
274 TSeqPos total_splices = 0;
275 TSeqPos consensus_splices = 0;
276 TSeqPos total_cds_splices = 0;
277 TSeqPos consensus_cds_splices = 0;
278
279 int exon_index = 0;
280 TSeqPos min_exon_length = 0, max_exon_length = 0; // initialize to
281 TSeqPos min_intron_length = 0, max_intron_length = 0; // avoid warnings
282 TSeqPos exon_length, intron_length;
283 TSeqPos exon_match_count, exon_possible_match_count;
284 double worst_exon_match_frac = 1.1; // guarantee that something is worse
285 TSeqPos worst_exon_match_count = 0; // initialize to
286 TSeqPos worst_exon_possible_match_count = 0; // avoid
287 TSeqPos worst_exon_length = 0; // compiler warnings
288 int indel_count = 0;
289 int cds_indel_count = 0;
290 TSeqPos exon_length_3p = 0;
291 TSeqPos exon_length_5p = 0;
292
293 // These will be used for counting genomic ambiguities
294 CSeq_data genomic_seq_data;
295 string& genomic_seq = genomic_seq_data.SetIupacna().Set();
296 CSeq_data genomic_cds_seq_data;
297 string& genomic_cds_seq = genomic_cds_seq_data.SetIupacna().Set();
298
299 int lag = 0; // cumulative gaps in xcript minus genomic in cds;
300 // initialize to avoid compiler warnings
301 int frame;
302
303 ITERATE (CSeq_align_set::Tdata, iter, disc) {
304 const CSeq_align& exon = **iter;
305 CRange<TSeqPos> r = exon.GetSeqRange(0);
306 exon_length = r.GetLength();
307
308 if(exon_index == 0) {
309 exon_length_5p = exon_length;
310 }
311 exon_length_3p = exon_length;
312
313 if (exon_index == 0 || exon_length > max_exon_length) {
314 max_exon_length = exon_length;
315 }
316 if (exon_index == 0 || exon_length < min_exon_length) {
317 min_exon_length = exon_length;
318 }
319
320 if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
321 intron_length = last_genomic_end - exon.GetSeqStop(1)- 1;
322 } else {
323 intron_length = exon.GetSeqStart(1) - last_genomic_end - 1;
324 }
325
326 if (exon_index > 0) {
327 // intron length
328 if (exon_index == 1 || intron_length > max_intron_length) {
329 max_intron_length = intron_length;
330 }
331 if (exon_index == 1 || intron_length < min_intron_length) {
332 min_intron_length = intron_length;
333 }
334
335 // splice consensus
336 CSeq_loc loc;
337 loc.SetInt().SetId().Assign(exon.GetSeq_id(1));
338 if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
339 loc.SetInt().SetFrom(last_genomic_end - 2);
340 loc.SetInt().SetTo (last_genomic_end - 1);
341 loc.SetInt().SetStrand(eNa_strand_minus);
342 } else {
343 loc.SetInt().SetFrom(last_genomic_end + 1);
344 loc.SetInt().SetTo (last_genomic_end + 2);
345 }
346 CSeqVector vec5(loc, scope);
347 vec5.SetIupacCoding();
348 string splice5;
349 vec5.GetSeqData(0, 2, splice5);
350
351 if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
352 loc.SetInt().SetFrom(exon.GetSeqStop(1) + 1);
353 loc.SetInt().SetTo (exon.GetSeqStop(1) + 2);
354 loc.SetInt().SetStrand(eNa_strand_minus);
355 } else {
356 loc.SetInt().SetFrom(exon.GetSeqStart(1) - 2);
357 loc.SetInt().SetTo (exon.GetSeqStart(1) - 1);
358 }
359 CSeqVector vec3(loc, scope);
360 vec3.SetIupacCoding();
361 string splice3;
362 vec3.GetSeqData(0, 2, splice3);
363
364 bool consensus_splice = IsConsensusSplice(splice5, splice3);
365 total_splices += 1;
366 if (consensus_splice) {
367 ++consensus_splices;
368 }
369 if (has_cds) {
370 if (cds_from < exon.GetSeqStart(0)
371 && cds_to >= exon.GetSeqStart(0)) {
372 ++total_cds_splices;
373 if (consensus_splice) {
374 ++consensus_cds_splices;
375 }
376 }
377 }
378 } // exon_index > 0
379
380 if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
381 last_genomic_end = exon.GetSeqStart(1);
382 } else {
383 last_genomic_end = exon.GetSeqStop(1);
384 }
385
386 if (cds_from >= exon.GetSeqStart(0)
387 && cds_from <= exon.GetSeqStop(0)) {
388 result->SetOutput_data()
389 .AddField("introns_5_prime_of_start", exon_index);
390 }
391 if (cds_to >= exon.GetSeqStart(0)
392 && cds_to <= exon.GetSeqStop(0)) {
393 unsigned int downstream_intron_count =
394 disc.size() - exon_index - 1;
395 result->SetOutput_data()
396 .AddField("introns_3_prime_of_stop",
397 (int) downstream_intron_count);
398 if (downstream_intron_count > 0) {
399 result->SetOutput_data()
400 .AddField("dist_stop_to_exon_end",
401 int(exon.GetSeqStop(0) - cds_to));
402 result->SetOutput_data()
403 .AddField("dist_stop_to_last_intron",
404 int((*++disc.rbegin())->GetSeqStop(0) - cds_to));
405 }
406 }
407
408 // count of transcript residues doing various things
409 CAlnVec avec(exon.GetSegs().GetDenseg(), scope);
410 avec.SetGapChar('-');
411 avec.SetEndChar('-');
412 exon_match_count = 0;
413 exon_possible_match_count = 0;
414
415 // need to be careful here because segment may begin with gap
416 bool in_cds = has_cds
417 && avec.GetSeqStart(0) > cds_from
418 && avec.GetSeqStart(0) <= cds_to;
419
420 for (TSeqPos i = 0; i <= avec.GetAlnStop(); ++i) {
421 bool gap_in_xcript = avec.GetResidue(0, i) == '-';
422 bool gap_in_genomic = avec.GetResidue(1, i) == '-';
423
424 if (!gap_in_xcript) {
425 TSeqPos pos = avec.GetSeqPosFromAlnPos(0, i);
426 if (has_cds) {
427 in_cds = pos >= cds_from && pos <= cds_to;
428 }
429 if (!gap_in_genomic) {
430 ++aligned_residue_count;
431 if (in_cds) {
432 ++cds_aligned_residue_count;
433 }
434 if (avec.GetResidue(0, i) == avec.GetResidue(1, i)) {
435 ++exon_match_count;
436 if (in_cds) {
437 if (cds_match_count == 0
438 && cds_possible_match_count == 0) {
439 // this is first cds match (probably start)
440 lag = 0;
441 }
442 ++cds_match_count;
443 frame = lag % 3;
444 if (frame < 0) {
445 frame += 3;
446 }
447 ++cds_match_count_by_frame[frame];
448 }
449 } else if (!gap_in_xcript) {
450 unsigned char cdna_res =
451 CAlnVec::FromIupac(avec.GetResidue(0, i));
452 unsigned char genomic_res =
453 CAlnVec::FromIupac(avec.GetResidue(1, i));
454 // count as a possible match if
455 // cdna is a subset of genomic
456 if (!(cdna_res & ~genomic_res)) {
457 ++exon_possible_match_count;
458 if (in_cds) {
459 if (cds_match_count == 0
460 && cds_possible_match_count == 0) {
461 // this is first cds match (probably start)
462 lag = 0;
463 }
464 ++cds_possible_match_count;
465 frame = lag % 3;
466 if (frame < 0) {
467 frame += 3;
468 }
469 ++cds_possible_match_count_by_frame[frame];
470 }
471 }
472 }
473 }
474 }
475 if (gap_in_xcript && in_cds) {
476 ++lag;
477 }
478 if (gap_in_genomic && in_cds) {
479 --lag;
480 }
481 if (!gap_in_genomic && in_cds) {
482 genomic_cds_seq.push_back(avec.GetResidue(1, i));
483 }
484 }
485 match_count += exon_match_count;
486 possible_match_count += exon_possible_match_count;
487
488 // look for "worst" exon
489 double exon_match_frac =
490 double(exon_match_count + exon_possible_match_count) / exon_length;
491 if (exon_match_frac < worst_exon_match_frac) {
492 worst_exon_match_frac = exon_match_frac;
493 worst_exon_match_count = exon_match_count;
494 worst_exon_possible_match_count = exon_possible_match_count;
495 worst_exon_length = exon_length;
496 }
497
498 // count indels (count as 1, regardless of length)
499 in_cds = has_cds
500 && avec.GetSeqStart(0) > cds_from
501 && avec.GetSeqStart(0) <= cds_to; // only for mRNA gaps
502
503 for (int seg = 0; seg < avec.GetNumSegs(); ++seg) {
504 if (avec.GetStart(0, seg) == -1) {
505 ++indel_count;
506 if (in_cds) {
507 ++cds_indel_count;
508 }
509 } else {
510 if (avec.GetStart(1, seg) == -1) {
511 ++indel_count;
512 // any overlap of segment with cds counts
513 if (has_cds
514 && cds_range.IntersectingWith(avec.GetRange(0, seg))) {
515 ++cds_indel_count;
516 }
517 }
518 in_cds = has_cds
519 && avec.GetStop(0, seg)
520 >= static_cast<TSignedSeqPos>(cds_from)
521 && avec.GetStop(0, seg)
522 < static_cast<TSignedSeqPos>(cds_to);
523 }
524 }
525
526 string exon_genomic_seq;
527 avec.GetSeqString(exon_genomic_seq, 1,
528 avec.GetSeqStart(1), avec.GetSeqStop(1));
529 genomic_seq += exon_genomic_seq;
530
531 ++exon_index;
532 }
533
534
535 result->SetOutput_data()
536 .AddField("max_exon_length", (int) max_exon_length);
537 result->SetOutput_data()
538 .AddField("min_exon_length", (int) min_exon_length);
539
540 result->SetOutput_data()
541 .AddField("5p_terminal_exon_length", (int) exon_length_5p);
542 result->SetOutput_data()
543 .AddField("3p_terminal_exon_length", (int) exon_length_3p);
544
545 if (disc.size() > 1) {
546 result->SetOutput_data()
547 .AddField("max_intron_length", (int) max_intron_length);
548 result->SetOutput_data()
549 .AddField("min_intron_length", (int) min_intron_length);
550 }
551 result->SetOutput_data()
552 .AddField("aligned_residues", (int) aligned_residue_count);
553 result->SetOutput_data()
554 .AddField("matching_residues", (int) match_count);
555 result->SetOutput_data()
556 .AddField("possibly_matching_residues", (int) possible_match_count);
557 if (has_cds) {
558 result->SetOutput_data()
559 .AddField("cds_matching_residues", (int) cds_match_count);
560 result->SetOutput_data()
561 .AddField("cds_possibly_matching_residues",
562 (int) cds_possible_match_count);
563 result->SetOutput_data()
564 .AddField("cds_aligned_residues", (int) cds_aligned_residue_count);
565 result->SetOutput_data()
566 .AddField("in_frame_cds_matching_residues",
567 (int) cds_match_count_by_frame[0]);
568 result->SetOutput_data()
569 .AddField("in_frame_cds_possibly_matching_residues",
570 (int) cds_possible_match_count_by_frame[0]);
571 }
572
573 result->SetOutput_data()
574 .AddField("total_splices_in_alignment", (int) total_splices);
575 result->SetOutput_data()
576 .AddField("consensus_splices_in_alignment", (int) consensus_splices);
577 if (has_cds) {
578 result->SetOutput_data()
579 .AddField("total_cds_splices_in_alignment",
580 (int) total_cds_splices);
581 result->SetOutput_data()
582 .AddField("consensus_cds_splices_in_alignment",
583 (int) consensus_cds_splices);
584 }
585 result->SetOutput_data()
586 .AddField("5_prime_bases_not_aligned",
587 (int) disc.front()->GetSeqStart(0));
588 result->SetOutput_data()
589 .AddField("3_prime_bases_not_aligned",
590 (int) (xcript_hand.GetBioseqLength()
591 - disc.back()->GetSeqStop(0) - 1));
592 if (has_cds) {
593 result->SetOutput_data()
594 .AddField("start_codon_in_aligned_region",
595 disc.front()->GetSeqStart(0) <= cds_from);
596 result->SetOutput_data()
597 .AddField("stop_codon_in_aligned_region",
598 disc.back()->GetSeqStop(0) >= cds_to);
599 }
600
601 result->SetOutput_data()
602 .AddField("worst_exon_matches", int(worst_exon_match_count));
603 result->SetOutput_data()
604 .AddField("worst_exon_possible_matches",
605 int(worst_exon_possible_match_count));
606 result->SetOutput_data()
607 .AddField("worst_exon_length", int(worst_exon_length));
608
609 result->SetOutput_data()
610 .AddField("indel_count", indel_count);
611 result->SetOutput_data()
612 .AddField("cds_indel_count", cds_indel_count);
613
614 CSeq_data out_seq;
615 vector<TSeqPos> out_indices;
616 TSeqPos gac =
617 CSeqportUtil::GetAmbigs(genomic_seq_data, &out_seq, &out_indices);
618 result->SetOutput_data()
619 .AddField("genomic_ambiguity_count", int(gac));
620 TSeqPos gcac =
621 CSeqportUtil::GetAmbigs(genomic_cds_seq_data, &out_seq, &out_indices);
622 result->SetOutput_data()
623 .AddField("genomic_cds_ambiguity_count", int(gcac));
624
625
626 result->SetOutput_data()
627 .AddField("upstream_polya_priming",
628 static_cast<int>(s_GetPolyA_genomic_priming(*aln, scope, -200)));
629 result->SetOutput_data()
630 .AddField("downstream_polya_priming",
631 static_cast<int>(s_GetPolyA_genomic_priming(*aln, scope, 200)));
632
633 // Some tests involving the genomic sequence
634
635 const CSeq_align& first_exon = *disc.front();
636 const CSeq_align& last_exon = *disc.back();
637 bool is_minus = s_GetSeqStrand(first_exon, 1) == eNa_strand_minus;
638
639 // Distance from alignment limits to genomic sequence
640 // gap or end
641
642 TSeqPos genomic_earliest, genomic_latest;
643 if (is_minus) {
644 genomic_earliest = last_exon.GetSeqStart(1);
645 genomic_latest = first_exon.GetSeqStop(1);
646 } else {
647 genomic_earliest = first_exon.GetSeqStart(1);
648 genomic_latest = last_exon.GetSeqStop(1);
649 }
650 const CSeqMap& seq_map = genomic_hand.GetSeqMap();
651
652 CSeqMap_CI map_iter = seq_map.Begin(0);
653 TSeqPos last = 0;
654 while (map_iter.GetType() != CSeqMap::eSeqEnd) {
655 if (map_iter.GetPosition() > genomic_earliest) {
656 break;
657 }
658 if (map_iter.GetType() == CSeqMap::eSeqGap) {
659 last = map_iter.GetEndPosition() + 1;
660 }
661 ++map_iter;
662 }
663 result->SetOutput_data()
664 .AddField(is_minus ? "downstream_dist_to_genomic_gap_or_end"
665 : "upstream_dist_to_genomic_gap_or_end",
666 int(genomic_earliest) - int(last));
667
668 while (map_iter) {
669 ++map_iter;
670 }
671 --map_iter;
672 last = genomic_hand.GetBioseqLength() - 1;
673 while (map_iter.GetType() != CSeqMap::eSeqEnd) {
674 if (map_iter.GetEndPosition() < genomic_latest) {
675 break;
676 }
677 if (map_iter.GetType() == CSeqMap::eSeqGap) {
678 last = map_iter.GetPosition() - 1;
679 }
680 --map_iter;
681 }
682 result->SetOutput_data()
683 .AddField(is_minus ? "upstream_dist_to_genomic_gap_or_end"
684 : "downstream_dist_to_genomic_gap_or_end",
685 int(last) - int(genomic_latest));
686
687
688 // A nearby annotated upstream gene on same strand?
689 // Do this only for single-"exon" alignments because
690 // it's slow.
691 if (disc.size() == 1) {
692 TSeqPos kLargestGeneDist = 10000; // how far upstream to look
693 TSeqPos genomic_start;
694 TSeqPos genomic_roi_from; // region on genomic
695 TSeqPos genomic_roi_to; // that interests us
696
697 if (is_minus) {
698 genomic_start = first_exon.GetSeqStop(1);
699 if (genomic_start < genomic_hand.GetBioseqLength() - 1) {
700 genomic_roi_from = genomic_start + 1;
701 } else {
702 genomic_roi_from = genomic_start;
703 }
704 if (genomic_start + kLargestGeneDist + 1
705 < genomic_hand.GetBioseqLength()) {
706 genomic_roi_to = genomic_start + kLargestGeneDist + 1;
707 } else {
708 genomic_roi_to = genomic_hand.GetBioseqLength() - 1;
709 }
710 } else {
711 genomic_start = first_exon.GetSeqStart(1);
712 if (genomic_start > kLargestGeneDist) {
713 genomic_roi_from = genomic_start - kLargestGeneDist - 1;
714 } else {
715 genomic_roi_from = 0;
716 }
717 if (genomic_start > 1) {
718 genomic_roi_to = genomic_start - 1;
719 } else {
720 genomic_roi_to = 0;
721 }
722 }
723 SAnnotSelector gene_sel(CSeqFeatData::e_Gene);
724 gene_sel.SetResolveAll();
725 CFeat_CI it(scope,
726 *genomic_hand.GetRangeSeq_loc(genomic_roi_from,
727 genomic_roi_to),
728 gene_sel);
729 TSeqPos shortest_dist = kLargestGeneDist + 1;
730 while (it) {
731 const CSeq_loc& gene_loc = it->GetLocation();
732 if (is_minus) {
733 if (sequence::GetStrand(gene_loc) == eNa_strand_minus) {
734 if (sequence::GetStart(gene_loc, 0) > genomic_start) {
735 shortest_dist = sequence::GetStart(gene_loc, 0)
736 - genomic_start - 1;
737 }
738 }
739 } else {
740 if (sequence::GetStrand(gene_loc) != eNa_strand_minus) {
741 if (sequence::GetStop(gene_loc, 0) < genomic_start) {
742 shortest_dist = genomic_start
743 - sequence::GetStop(gene_loc, 0) - 1;
744 }
745 }
746 }
747 ++it;
748 }
749 result->SetOutput_data()
750 .AddField("has_nearby_upstream_gene_same_strand",
751 shortest_dist <= kLargestGeneDist);
752 if (shortest_dist <= kLargestGeneDist) {
753 result->SetOutput_data()
754 .AddField("distance_from_upstream_gene_same_strand",
755 int(shortest_dist));
756 }
757
758 }
759
760 return ref;
761 }
762
763
764 // Conversion from Spliced-seg to disc.
765 // This should probably eventually be moved, e.g., to CSeq_align.
766
767 static vector<TSignedSeqPos>
s_CalculateStarts(const vector<TSeqPos> & lens,ENa_strand strand,TSeqPos start,TSeqPos end)768 s_CalculateStarts(const vector<TSeqPos>& lens, ENa_strand strand,
769 TSeqPos start, TSeqPos end)
770 {
771 vector<TSignedSeqPos> rv;
772 rv.reserve(lens.size());
773 TSignedSeqPos offset = 0;
774 ITERATE (vector<TSeqPos>, len, lens) {
775 if (*len == 0) {
776 // a gap
777 rv.push_back(-1);
778 } else {
779 if (IsReverse(strand)) {
780 offset += *len;
781 rv.push_back((end + 1) - offset);
782 } else {
783 rv.push_back(start + offset);
784 offset += *len;
785 }
786 }
787 }
788 return rv;
789 }
790
791
792 static CRef<CDense_seg>
s_ExonToDenseg(const CSpliced_exon & exon,ENa_strand product_strand,ENa_strand genomic_strand,const CSeq_id & product_id,const CSeq_id & genomic_id)793 s_ExonToDenseg(const CSpliced_exon& exon,
794 ENa_strand product_strand, ENa_strand genomic_strand,
795 const CSeq_id& product_id, const CSeq_id& genomic_id)
796 {
797 CRef<CDense_seg> ds(new CDense_seg);
798
799 vector<TSeqPos> product_lens;
800 vector<TSeqPos> genomic_lens;
801 ITERATE (CSpliced_exon::TParts, iter, exon.GetParts()) {
802 const CSpliced_exon_chunk& part = **iter;
803 if (part.IsMatch()) {
804 product_lens.push_back(part.GetMatch());
805 genomic_lens.push_back(part.GetMatch());
806 } else if (part.IsMismatch()) {
807 product_lens.push_back(part.GetMismatch());
808 genomic_lens.push_back(part.GetMismatch());
809 } else if (part.IsDiag()) {
810 product_lens.push_back(part.GetDiag());
811 genomic_lens.push_back(part.GetDiag());
812 } else if (part.IsProduct_ins()) {
813 product_lens.push_back(part.GetProduct_ins());
814 genomic_lens.push_back(0);
815 } else if (part.IsGenomic_ins()) {
816 product_lens.push_back(0);
817 genomic_lens.push_back(part.GetGenomic_ins());
818 } else {
819 throw runtime_error("unhandled part type in Spliced-enon");
820 }
821 }
822
823 CDense_seg::TLens& lens = ds->SetLens();
824 lens.reserve(product_lens.size());
825 for (unsigned int i = 0; i < product_lens.size(); ++i) {
826 lens.push_back(max(product_lens[i], genomic_lens[i]));
827 }
828 vector<TSignedSeqPos> product_starts =
829 s_CalculateStarts(product_lens, product_strand,
830 exon.GetProduct_start().GetNucpos(),
831 exon.GetProduct_end().GetNucpos());
832 vector<TSignedSeqPos> genomic_starts =
833 s_CalculateStarts(genomic_lens, genomic_strand,
834 exon.GetGenomic_start(),
835 exon.GetGenomic_end());
836
837 CDense_seg::TStarts& starts = ds->SetStarts();
838 starts.reserve(product_starts.size() + genomic_starts.size());
839 for (unsigned int i = 0; i < lens.size(); ++i) {
840 starts.push_back(product_starts[i]); // product row first
841 starts.push_back(genomic_starts[i]);
842 }
843 ds->SetIds().push_back(CRef<CSeq_id>(SerialClone(product_id)));
844 ds->SetIds().push_back(CRef<CSeq_id>(SerialClone(genomic_id)));
845
846 // Set strands, unless they're both plus
847 if (!(product_strand == eNa_strand_plus
848 && genomic_strand == eNa_strand_plus)) {
849 CDense_seg::TStrands& strands = ds->SetStrands();
850 for (unsigned int i = 0; i < lens.size(); ++i) {
851 strands.push_back(product_strand);
852 strands.push_back(genomic_strand);
853 }
854 }
855
856 ds->SetNumseg(lens.size());
857
858 ds->Compact(); // join adjacent match/mismatch/diag parts
859 return ds;
860 }
861
862
s_SplicedToDisc(const CSeq_align & spliced_seg_aln)863 static CRef<CSeq_align> s_SplicedToDisc(const CSeq_align& spliced_seg_aln)
864 {
865
866 CRef<CSeq_align> disc(new CSeq_align);
867 disc->SetType(CSeq_align::eType_disc);
868
869 const CSpliced_seg& spliced_seg = spliced_seg_aln.GetSegs().GetSpliced();
870
871 ENa_strand product_strand = spliced_seg.GetProduct_strand();
872 ENa_strand genomic_strand = spliced_seg.GetGenomic_strand();
873 const CSeq_id& product_id = spliced_seg.GetProduct_id();
874 const CSeq_id& genomic_id = spliced_seg.GetGenomic_id();
875
876 //for exon in spliced_seg.GetSegs().GetSpliced().GetExons()[:] {
877 ITERATE (CSpliced_seg::TExons, exon, spliced_seg.GetExons()) {
878 CRef<CDense_seg> ds = s_ExonToDenseg(**exon,
879 product_strand, genomic_strand,
880 product_id, genomic_id);
881 CRef<CSeq_align> ds_align(new CSeq_align);
882 ds_align->SetSegs().SetDenseg(*ds);
883 ds_align->SetType(CSeq_align::eType_partial);
884 disc->SetSegs().SetDisc().Set().push_back(ds_align);
885 }
886 return disc;
887 }
888
889
890 END_NCBI_SCOPE
891