1 /*  $Id: single_aln_tests.cpp 353529 2012-02-16 18:22:04Z astashya $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Josh Cherry
27  *
28  * File Description:  Tests operating on an alignment of a transcript to
29  *                      genomic DNA
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/seqqa/single_aln_tests.hpp>
35 #include <objects/seqalign/Seq_align.hpp>
36 #include <objects/seqloc/Seq_interval.hpp>
37 #include <objects/seq/Seq_data.hpp>
38 #include <objects/seq/IUPACna.hpp>
39 #include <objtools/alnmgr/alnvec.hpp>
40 #include <objects/seqalign/Seq_align_set.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objects/seqalign/Seq_align.hpp>
43 #include <objects/seqloc/Seq_loc.hpp>
44 #include <objects/seqalign/Dense_seg.hpp>
45 #include <objects/seqalign/Spliced_exon.hpp>
46 #include <objects/seqalign/Spliced_seg.hpp>
47 #include <objects/seqalign/Spliced_exon_chunk.hpp>
48 #include <objects/seqalign/Product_pos.hpp>
49 #include <objects/seqfeat/Seq_feat.hpp>
50 #include <objects/seqfeat/Cdregion.hpp>
51 #include <objects/seqfeat/Code_break.hpp>
52 #include <objects/general/Object_id.hpp>
53 #include <objects/general/User_object.hpp>
54 #include <objects/seq/seqport_util.hpp>
55 #include <objmgr/seq_vector.hpp>
56 #include <objmgr/bioseq_handle.hpp>
57 #include <objmgr/annot_selector.hpp>
58 #include <objmgr/feat_ci.hpp>
59 #include <objmgr/util/sequence.hpp>
60 #include <objmgr/seq_loc_mapper.hpp>
61 #include <algo/sequence/consensus_splice.hpp>
62 
63 BEGIN_NCBI_SCOPE
64 USING_SCOPE(objects);
65 
CanTest(const CSerialObject & obj,const CSeqTestContext * ctx) const66 bool CTestSingleAln::CanTest(const CSerialObject& obj,
67                              const CSeqTestContext* ctx) const
68 {
69     const CSeq_align* aln = dynamic_cast<const CSeq_align*>(&obj);
70     if (aln) {
71         // Can analyze a disc alignment...
72         if (aln->GetType() == ncbi::CSeq_align::eType_disc) {
73             return true;
74         }
75         // ...or a Spliced-seg where product is a transcript
76         if (aln->GetSegs().IsSpliced()) {
77             if (aln->GetSegs().GetSpliced().GetProduct_type()
78                 == CSpliced_seg::eProduct_type_transcript) {
79                 return true;
80             }
81         }
82     }
83     return false;
84 }
85 
86 
87 // Like CSeq_align::GetSeqStrand, but if the strand is not set,
88 // returns eNa_strand_plus rather than throwing an exception
s_GetSeqStrand(const CSeq_align & aln,CSeq_align::TDim row)89 static ENa_strand s_GetSeqStrand(const CSeq_align& aln, CSeq_align::TDim row)
90 {
91     try {
92         return aln.GetSeqStrand(row);
93     } catch(CSeqalignException&) {
94         return eNa_strand_plus;
95     }
96 }
97 
98 
99 static CRef<CSeq_align> s_SplicedToDisc(const CSeq_align& spliced_seg_aln);
100 
101 
102 //if neighborhood < 0 - search upstream of cleavage; otherwise downstream.
s_GetPolyA_genomic_priming(const CSeq_align & aln,CScope & scope,int neighborhood)103 static size_t s_GetPolyA_genomic_priming(const CSeq_align& aln, CScope& scope, int neighborhood)
104 {
105     CBioseq_Handle bsh = scope.GetBioseqHandle(aln.GetSeq_id(1));
106 
107     CRef<CSeq_loc> loc(new CSeq_loc); //upstream or downstream neighborhood loc
108     {{
109         ENa_strand strand = s_GetSeqStrand(aln, 1);
110 
111         //cleavage position
112         TSeqPos pos = strand == eNa_strand_minus ? aln.GetSeqStart(1) : aln.GetSeqStop(1);
113 
114         //if searching downstream of cleavage, start with first pos past end of alignment
115         if(neighborhood > 0 && pos > 0 && pos < bsh.GetInst_Length() - 1) {
116             pos += strand == eNa_strand_minus ? -1 : +1;
117         }
118 
119         loc->SetInt().SetId().Assign(aln.GetSeq_id(1));
120         loc->SetInt().SetFrom(pos);
121         loc->SetInt().SetTo(pos);
122         loc->SetInt().SetStrand(strand);
123 
124         if(   (neighborhood >= 0 && strand != eNa_strand_minus)
125            || (neighborhood < 0  && strand == eNa_strand_minus))
126         {
127             loc->SetInt().SetTo(min(bsh.GetInst_Length() - 1, pos + abs(neighborhood)));
128         } else {
129             loc->SetInt().SetFrom(pos < abs(neighborhood) ? 0 : pos - abs(neighborhood));
130         }
131     }}
132 
133     CSeqVector vec(*loc, scope, CBioseq_Handle::eCoding_Iupac);
134     string seq("");
135 
136     vec.GetSeqData(vec.begin(), vec.end(), seq);
137     if(neighborhood < 0) {
138         reverse(seq.begin(), seq.end());
139     }
140 
141     size_t best_pos(NPOS);
142     {{
143         static const int w_match = 1;
144         static const int w_mismatch = -4;
145         static const int x_dropoff = 15;
146 
147         int best_score = 0;
148         int curr_score = 0;
149 
150         for(size_t curr_pos = 0;
151             curr_pos < seq.size() && curr_score + x_dropoff > best_score;
152             ++curr_pos)
153         {
154             curr_score += seq[curr_pos] == 'A' ? w_match : w_mismatch;
155             if(curr_score >= best_score) {
156                 best_score = curr_score;
157                 best_pos = curr_pos;
158             }
159         }
160     }}
161     size_t priming_length = best_pos == NPOS ? 0 : best_pos + 1;
162 
163     //string label;
164     //loc->GetLabel(&label);
165     //LOG_POST(aln.GetSeq_id(0).AsFastaString() << "\t" << label << "\t" << neighborhood << "\t" << seq << "\t" << priming_length);
166 
167     return priming_length;
168 }
169 
170 
171 
172 CRef<CSeq_test_result_set>
RunTest(const CSerialObject & obj,const CSeqTestContext * ctx)173 CTestSingleAln_All::RunTest(const CSerialObject& obj,
174                             const CSeqTestContext* ctx)
175 {
176     CRef<CSeq_test_result_set> ref;
177     CConstRef<CSeq_align> aln(dynamic_cast<const CSeq_align*>(&obj));
178     if ( !aln  ||  !ctx) {
179         return ref;
180     }
181 
182     // Handle a Spliced-seg by converting it to a disc
183     if (aln->GetSegs().IsSpliced()) {
184         aln = s_SplicedToDisc(*aln);
185     } else if(!aln->GetSegs().IsDisc()) {
186         NCBI_THROW(CException, eUnknown, "Expected spliced or disc alignment");
187     }
188 
189     ref.Reset(new CSeq_test_result_set());
190 
191     CRef<CSeq_test_result> result = x_SkeletalTestResult("singlealn_all");
192     ref->Set().push_back(result);
193 
194     CScope& scope = ctx->GetScope();
195 
196     const CSeq_align_set::Tdata& disc = aln->GetSegs().GetDisc().Get();
197     result->SetOutput_data()
198         .AddField("exon_count", (int) disc.size());
199 
200     CBioseq_Handle xcript_hand
201         = scope.GetBioseqHandle(disc.front()->GetSeq_id(0));
202     SAnnotSelector sel(CSeqFeatData::eSubtype_cdregion);
203     sel.SetResolveDepth(0);
204     CFeat_CI it(xcript_hand, sel);
205     bool has_cds(it);
206     TSeqPos cds_from = 0, cds_to = 0;  // initialize to avoid compiler warning
207     CAlnVec::TSignedRange cds_range;
208     const CSeq_id& genomic_id = disc.front()->GetSeq_id(1);
209     CBioseq_Handle genomic_hand = scope.GetBioseqHandle(genomic_id);
210     if (has_cds) {
211         const CSeq_loc& cds_loc = it->GetLocation();
212         cds_from = sequence::GetStart(cds_loc, 0);
213         cds_to   = sequence::GetStop(cds_loc, 0);
214         cds_range.SetFrom(static_cast<TSignedSeqPos>(cds_from));
215         cds_range.SetTo(static_cast<TSignedSeqPos>(cds_to));
216 
217         // Assess whether differences between transcript and genome
218         // would affect the protein produced
219         CSeq_loc_Mapper mapper(*aln, 1, &scope);
220         CRef<CSeq_loc> genomic_cds_loc = mapper.Map(cds_loc);
221         const CGenetic_code* code = 0;
222         if (it->GetData().GetCdregion().CanGetCode()) {
223             code = &it->GetData().GetCdregion().GetCode();
224         }
225         string xcript_prot, genomic_prot;
226         CSeqTranslator::Translate(cds_loc, scope, xcript_prot, code);
227         CSeqTranslator::Translate(*genomic_cds_loc, scope, genomic_prot, code);
228         // Say that they "can make same prot" iff the translations
229         // are the same AND do not contain ambiguities
230         bool can_make_same_prot = false;
231         if (xcript_prot == genomic_prot &&
232             xcript_prot.find_first_not_of("ACDEFGHIKLMNPQRSTVWY*") == NPOS) {
233                                   // no need to check genomic (it's equal)
234 
235             can_make_same_prot = true;  // provisionally true, but...
236 
237             // Demand that any code-breaks annotated on transcript
238             // are identical at the nucleotide level in the genomic
239             if (it->GetData().GetCdregion().IsSetCode_break()) {
240                 ITERATE (CCdregion::TCode_break, cb,
241                          it->GetMappedFeature().GetData().GetCdregion()
242                          .GetCode_break()) {
243                     const CCode_break& code_break = **cb;
244                     const CSeq_loc& xcript_cb_loc = code_break.GetLoc();
245                     CRef<CSeq_loc> genomic_cb_loc = mapper.Map(xcript_cb_loc);
246 
247                     CSeqVector gvec(*genomic_cb_loc, scope);
248                     string gseq;
249                     gvec.GetSeqData(0, gvec.size(), gseq);
250 
251                     CSeqVector xvec(xcript_cb_loc, scope);
252                     string xseq;
253                     xvec.GetSeqData(0, xvec.size(), xseq);
254 
255                     if (gseq != xseq) {
256                         can_make_same_prot = false;
257                         break;
258                     }
259                 }
260             }
261         }
262         result->SetOutput_data()
263             .AddField("can_make_same_prot", can_make_same_prot);
264     }
265 
266 
267     TSeqPos last_genomic_end = 0;
268     TSeqPos aligned_residue_count = 0;
269     TSeqPos cds_aligned_residue_count = 0;
270     TSeqPos match_count = 0, possible_match_count = 0;
271     TSeqPos cds_match_count = 0, cds_possible_match_count = 0;
272     vector<int> cds_match_count_by_frame(3);
273     vector<int> cds_possible_match_count_by_frame(3);
274     TSeqPos total_splices = 0;
275     TSeqPos consensus_splices = 0;
276     TSeqPos total_cds_splices = 0;
277     TSeqPos consensus_cds_splices = 0;
278 
279     int exon_index = 0;
280     TSeqPos min_exon_length = 0, max_exon_length = 0;      // initialize to
281     TSeqPos min_intron_length = 0, max_intron_length = 0;  // avoid warnings
282     TSeqPos exon_length, intron_length;
283     TSeqPos exon_match_count, exon_possible_match_count;
284     double worst_exon_match_frac = 1.1;  // guarantee that something is worse
285     TSeqPos worst_exon_match_count = 0;           // initialize to
286     TSeqPos worst_exon_possible_match_count = 0;  // avoid
287     TSeqPos worst_exon_length = 0;                // compiler warnings
288     int indel_count = 0;
289     int cds_indel_count = 0;
290     TSeqPos exon_length_3p = 0;
291     TSeqPos exon_length_5p = 0;
292 
293     // These will be used for counting genomic ambiguities
294     CSeq_data genomic_seq_data;
295     string& genomic_seq = genomic_seq_data.SetIupacna().Set();
296     CSeq_data genomic_cds_seq_data;
297     string& genomic_cds_seq = genomic_cds_seq_data.SetIupacna().Set();
298 
299     int lag = 0;  // cumulative gaps in xcript minus genomic in cds;
300                   // initialize to avoid compiler warnings
301     int frame;
302 
303     ITERATE (CSeq_align_set::Tdata, iter, disc) {
304         const CSeq_align& exon = **iter;
305         CRange<TSeqPos> r = exon.GetSeqRange(0);
306         exon_length = r.GetLength();
307 
308         if(exon_index == 0) {
309             exon_length_5p = exon_length;
310         }
311         exon_length_3p = exon_length;
312 
313         if (exon_index == 0 || exon_length > max_exon_length) {
314             max_exon_length = exon_length;
315         }
316         if (exon_index == 0 || exon_length < min_exon_length) {
317             min_exon_length = exon_length;
318         }
319 
320         if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
321             intron_length = last_genomic_end - exon.GetSeqStop(1)- 1;
322         } else {
323             intron_length = exon.GetSeqStart(1) - last_genomic_end - 1;
324         }
325 
326         if (exon_index > 0) {
327             // intron length
328             if (exon_index == 1 || intron_length > max_intron_length) {
329                 max_intron_length = intron_length;
330             }
331             if (exon_index == 1 || intron_length < min_intron_length) {
332                 min_intron_length = intron_length;
333             }
334 
335             // splice consensus
336             CSeq_loc loc;
337             loc.SetInt().SetId().Assign(exon.GetSeq_id(1));
338             if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
339                 loc.SetInt().SetFrom(last_genomic_end - 2);
340                 loc.SetInt().SetTo  (last_genomic_end - 1);
341                 loc.SetInt().SetStrand(eNa_strand_minus);
342             } else {
343                 loc.SetInt().SetFrom(last_genomic_end + 1);
344                 loc.SetInt().SetTo  (last_genomic_end + 2);
345             }
346             CSeqVector vec5(loc, scope);
347             vec5.SetIupacCoding();
348             string splice5;
349             vec5.GetSeqData(0, 2, splice5);
350 
351             if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
352                 loc.SetInt().SetFrom(exon.GetSeqStop(1) + 1);
353                 loc.SetInt().SetTo  (exon.GetSeqStop(1) + 2);
354                 loc.SetInt().SetStrand(eNa_strand_minus);
355             } else {
356                 loc.SetInt().SetFrom(exon.GetSeqStart(1) - 2);
357                 loc.SetInt().SetTo  (exon.GetSeqStart(1) - 1);
358             }
359             CSeqVector vec3(loc, scope);
360             vec3.SetIupacCoding();
361             string splice3;
362             vec3.GetSeqData(0, 2, splice3);
363 
364             bool consensus_splice = IsConsensusSplice(splice5, splice3);
365             total_splices += 1;
366             if (consensus_splice) {
367                 ++consensus_splices;
368             }
369             if (has_cds) {
370                 if (cds_from < exon.GetSeqStart(0)
371                     && cds_to >= exon.GetSeqStart(0)) {
372                     ++total_cds_splices;
373                     if (consensus_splice) {
374                         ++consensus_cds_splices;
375                     }
376                 }
377             }
378         } // exon_index > 0
379 
380         if (s_GetSeqStrand(exon, 1) == eNa_strand_minus) {
381             last_genomic_end = exon.GetSeqStart(1);
382         } else {
383             last_genomic_end = exon.GetSeqStop(1);
384         }
385 
386         if (cds_from >= exon.GetSeqStart(0)
387             && cds_from <= exon.GetSeqStop(0)) {
388             result->SetOutput_data()
389                 .AddField("introns_5_prime_of_start", exon_index);
390         }
391         if (cds_to >= exon.GetSeqStart(0)
392             && cds_to <= exon.GetSeqStop(0)) {
393             unsigned int downstream_intron_count =
394                 disc.size() - exon_index - 1;
395             result->SetOutput_data()
396                 .AddField("introns_3_prime_of_stop",
397                           (int) downstream_intron_count);
398             if (downstream_intron_count > 0) {
399                 result->SetOutput_data()
400                     .AddField("dist_stop_to_exon_end",
401                               int(exon.GetSeqStop(0) - cds_to));
402                 result->SetOutput_data()
403                     .AddField("dist_stop_to_last_intron",
404                               int((*++disc.rbegin())->GetSeqStop(0) - cds_to));
405             }
406         }
407 
408         // count of transcript residues doing various things
409         CAlnVec avec(exon.GetSegs().GetDenseg(), scope);
410         avec.SetGapChar('-');
411         avec.SetEndChar('-');
412         exon_match_count = 0;
413         exon_possible_match_count = 0;
414 
415         // need to be careful here because segment may begin with gap
416         bool in_cds = has_cds
417             && avec.GetSeqStart(0) > cds_from
418             && avec.GetSeqStart(0) <= cds_to;
419 
420         for (TSeqPos i = 0;  i <= avec.GetAlnStop();  ++i) {
421             bool gap_in_xcript  = avec.GetResidue(0, i) == '-';
422             bool gap_in_genomic = avec.GetResidue(1, i) == '-';
423 
424             if (!gap_in_xcript) {
425                 TSeqPos pos = avec.GetSeqPosFromAlnPos(0, i);
426                 if (has_cds) {
427                     in_cds = pos >= cds_from && pos <= cds_to;
428                 }
429                 if (!gap_in_genomic) {
430                     ++aligned_residue_count;
431                     if (in_cds) {
432                         ++cds_aligned_residue_count;
433                     }
434                     if (avec.GetResidue(0, i) == avec.GetResidue(1, i)) {
435                         ++exon_match_count;
436                         if (in_cds) {
437                             if (cds_match_count == 0
438                                 && cds_possible_match_count == 0) {
439                                 // this is first cds match (probably start)
440                                 lag = 0;
441                             }
442                             ++cds_match_count;
443                             frame = lag % 3;
444                             if (frame < 0) {
445                                 frame += 3;
446                             }
447                             ++cds_match_count_by_frame[frame];
448                         }
449                     } else if (!gap_in_xcript) {
450                         unsigned char cdna_res =
451                             CAlnVec::FromIupac(avec.GetResidue(0, i));
452                         unsigned char genomic_res =
453                             CAlnVec::FromIupac(avec.GetResidue(1, i));
454                         // count as a possible match if
455                         // cdna is a subset of genomic
456                         if (!(cdna_res & ~genomic_res)) {
457                             ++exon_possible_match_count;
458                             if (in_cds) {
459                                 if (cds_match_count == 0
460                                     && cds_possible_match_count == 0) {
461                                     // this is first cds match (probably start)
462                                     lag = 0;
463                                 }
464                                 ++cds_possible_match_count;
465                                 frame = lag % 3;
466                                 if (frame < 0) {
467                                     frame += 3;
468                                 }
469                                 ++cds_possible_match_count_by_frame[frame];
470                             }
471                         }
472                     }
473                 }
474             }
475             if (gap_in_xcript && in_cds) {
476                 ++lag;
477             }
478             if (gap_in_genomic && in_cds) {
479                 --lag;
480             }
481             if (!gap_in_genomic && in_cds) {
482                 genomic_cds_seq.push_back(avec.GetResidue(1, i));
483             }
484         }
485         match_count += exon_match_count;
486         possible_match_count += exon_possible_match_count;
487 
488         // look for "worst" exon
489         double exon_match_frac =
490             double(exon_match_count + exon_possible_match_count) / exon_length;
491         if (exon_match_frac < worst_exon_match_frac) {
492             worst_exon_match_frac = exon_match_frac;
493             worst_exon_match_count = exon_match_count;
494             worst_exon_possible_match_count = exon_possible_match_count;
495             worst_exon_length = exon_length;
496         }
497 
498         // count indels (count as 1, regardless of length)
499         in_cds = has_cds
500             && avec.GetSeqStart(0) > cds_from
501             && avec.GetSeqStart(0) <= cds_to;  // only for mRNA gaps
502 
503         for (int seg = 0; seg < avec.GetNumSegs();  ++seg) {
504             if (avec.GetStart(0, seg) == -1) {
505                 ++indel_count;
506                 if (in_cds) {
507                     ++cds_indel_count;
508                 }
509             } else {
510                 if (avec.GetStart(1, seg) == -1) {
511                     ++indel_count;
512                     // any overlap of segment with cds counts
513                     if (has_cds
514                         && cds_range.IntersectingWith(avec.GetRange(0, seg))) {
515                         ++cds_indel_count;
516                     }
517                 }
518                 in_cds = has_cds
519                     && avec.GetStop(0, seg)
520                     >= static_cast<TSignedSeqPos>(cds_from)
521                     && avec.GetStop(0, seg)
522                     < static_cast<TSignedSeqPos>(cds_to);
523             }
524         }
525 
526         string exon_genomic_seq;
527         avec.GetSeqString(exon_genomic_seq, 1,
528                           avec.GetSeqStart(1), avec.GetSeqStop(1));
529         genomic_seq += exon_genomic_seq;
530 
531         ++exon_index;
532     }
533 
534 
535     result->SetOutput_data()
536         .AddField("max_exon_length", (int) max_exon_length);
537     result->SetOutput_data()
538         .AddField("min_exon_length", (int) min_exon_length);
539 
540     result->SetOutput_data()
541         .AddField("5p_terminal_exon_length", (int) exon_length_5p);
542     result->SetOutput_data()
543         .AddField("3p_terminal_exon_length", (int) exon_length_3p);
544 
545     if (disc.size() > 1) {
546         result->SetOutput_data()
547             .AddField("max_intron_length", (int) max_intron_length);
548         result->SetOutput_data()
549             .AddField("min_intron_length", (int) min_intron_length);
550     }
551     result->SetOutput_data()
552         .AddField("aligned_residues", (int) aligned_residue_count);
553     result->SetOutput_data()
554         .AddField("matching_residues", (int) match_count);
555     result->SetOutput_data()
556         .AddField("possibly_matching_residues", (int) possible_match_count);
557     if (has_cds) {
558         result->SetOutput_data()
559             .AddField("cds_matching_residues", (int) cds_match_count);
560         result->SetOutput_data()
561             .AddField("cds_possibly_matching_residues",
562                       (int) cds_possible_match_count);
563         result->SetOutput_data()
564             .AddField("cds_aligned_residues", (int) cds_aligned_residue_count);
565         result->SetOutput_data()
566             .AddField("in_frame_cds_matching_residues",
567                       (int) cds_match_count_by_frame[0]);
568         result->SetOutput_data()
569             .AddField("in_frame_cds_possibly_matching_residues",
570                       (int) cds_possible_match_count_by_frame[0]);
571     }
572 
573     result->SetOutput_data()
574         .AddField("total_splices_in_alignment", (int) total_splices);
575     result->SetOutput_data()
576         .AddField("consensus_splices_in_alignment", (int) consensus_splices);
577     if (has_cds) {
578         result->SetOutput_data()
579             .AddField("total_cds_splices_in_alignment",
580                       (int) total_cds_splices);
581         result->SetOutput_data()
582             .AddField("consensus_cds_splices_in_alignment",
583                       (int) consensus_cds_splices);
584     }
585     result->SetOutput_data()
586         .AddField("5_prime_bases_not_aligned",
587                   (int) disc.front()->GetSeqStart(0));
588     result->SetOutput_data()
589         .AddField("3_prime_bases_not_aligned",
590                   (int) (xcript_hand.GetBioseqLength()
591                   - disc.back()->GetSeqStop(0) - 1));
592     if (has_cds) {
593         result->SetOutput_data()
594             .AddField("start_codon_in_aligned_region",
595                       disc.front()->GetSeqStart(0) <= cds_from);
596         result->SetOutput_data()
597             .AddField("stop_codon_in_aligned_region",
598                       disc.back()->GetSeqStop(0) >= cds_to);
599     }
600 
601     result->SetOutput_data()
602         .AddField("worst_exon_matches", int(worst_exon_match_count));
603     result->SetOutput_data()
604         .AddField("worst_exon_possible_matches",
605                   int(worst_exon_possible_match_count));
606     result->SetOutput_data()
607         .AddField("worst_exon_length", int(worst_exon_length));
608 
609     result->SetOutput_data()
610         .AddField("indel_count", indel_count);
611     result->SetOutput_data()
612         .AddField("cds_indel_count", cds_indel_count);
613 
614     CSeq_data out_seq;
615     vector<TSeqPos> out_indices;
616     TSeqPos gac =
617         CSeqportUtil::GetAmbigs(genomic_seq_data, &out_seq, &out_indices);
618     result->SetOutput_data()
619         .AddField("genomic_ambiguity_count", int(gac));
620     TSeqPos gcac =
621         CSeqportUtil::GetAmbigs(genomic_cds_seq_data, &out_seq, &out_indices);
622     result->SetOutput_data()
623         .AddField("genomic_cds_ambiguity_count", int(gcac));
624 
625 
626     result->SetOutput_data()
627         .AddField("upstream_polya_priming",
628                   static_cast<int>(s_GetPolyA_genomic_priming(*aln, scope, -200)));
629     result->SetOutput_data()
630         .AddField("downstream_polya_priming",
631                   static_cast<int>(s_GetPolyA_genomic_priming(*aln, scope, 200)));
632 
633     // Some tests involving the genomic sequence
634 
635     const CSeq_align& first_exon = *disc.front();
636     const CSeq_align& last_exon = *disc.back();
637     bool is_minus = s_GetSeqStrand(first_exon, 1) == eNa_strand_minus;
638 
639     // Distance from alignment limits to genomic sequence
640     // gap or end
641 
642     TSeqPos genomic_earliest, genomic_latest;
643     if (is_minus) {
644         genomic_earliest = last_exon.GetSeqStart(1);
645         genomic_latest = first_exon.GetSeqStop(1);
646     } else {
647         genomic_earliest = first_exon.GetSeqStart(1);
648         genomic_latest = last_exon.GetSeqStop(1);
649     }
650     const CSeqMap& seq_map = genomic_hand.GetSeqMap();
651 
652     CSeqMap_CI map_iter = seq_map.Begin(0);
653     TSeqPos last = 0;
654     while (map_iter.GetType() != CSeqMap::eSeqEnd) {
655         if (map_iter.GetPosition() > genomic_earliest) {
656             break;
657         }
658         if (map_iter.GetType() == CSeqMap::eSeqGap) {
659             last = map_iter.GetEndPosition() + 1;
660         }
661         ++map_iter;
662     }
663     result->SetOutput_data()
664         .AddField(is_minus ? "downstream_dist_to_genomic_gap_or_end"
665                   : "upstream_dist_to_genomic_gap_or_end",
666                   int(genomic_earliest) - int(last));
667 
668     while (map_iter) {
669         ++map_iter;
670     }
671     --map_iter;
672     last = genomic_hand.GetBioseqLength() - 1;
673     while (map_iter.GetType() != CSeqMap::eSeqEnd) {
674         if (map_iter.GetEndPosition() < genomic_latest) {
675             break;
676         }
677         if (map_iter.GetType() == CSeqMap::eSeqGap) {
678             last = map_iter.GetPosition() - 1;
679         }
680         --map_iter;
681     }
682     result->SetOutput_data()
683         .AddField(is_minus ? "upstream_dist_to_genomic_gap_or_end"
684                   : "downstream_dist_to_genomic_gap_or_end",
685                   int(last) - int(genomic_latest));
686 
687 
688     // A nearby annotated upstream gene on same strand?
689     // Do this only for single-"exon" alignments because
690     // it's slow.
691     if (disc.size() == 1) {
692         TSeqPos kLargestGeneDist = 10000;  // how far upstream to look
693         TSeqPos genomic_start;
694         TSeqPos genomic_roi_from;  // region on genomic
695         TSeqPos genomic_roi_to;    // that interests us
696 
697         if (is_minus) {
698             genomic_start = first_exon.GetSeqStop(1);
699             if (genomic_start < genomic_hand.GetBioseqLength() - 1) {
700                 genomic_roi_from = genomic_start + 1;
701             } else {
702                 genomic_roi_from = genomic_start;
703             }
704             if (genomic_start + kLargestGeneDist + 1
705                 < genomic_hand.GetBioseqLength()) {
706                 genomic_roi_to = genomic_start + kLargestGeneDist + 1;
707             } else {
708                 genomic_roi_to = genomic_hand.GetBioseqLength() - 1;
709             }
710         } else {
711             genomic_start = first_exon.GetSeqStart(1);
712             if (genomic_start > kLargestGeneDist) {
713                 genomic_roi_from = genomic_start - kLargestGeneDist - 1;
714             } else {
715                 genomic_roi_from = 0;
716             }
717             if (genomic_start > 1) {
718                 genomic_roi_to = genomic_start - 1;
719             } else {
720                 genomic_roi_to = 0;
721             }
722         }
723         SAnnotSelector gene_sel(CSeqFeatData::e_Gene);
724         gene_sel.SetResolveAll();
725         CFeat_CI it(scope,
726                     *genomic_hand.GetRangeSeq_loc(genomic_roi_from,
727                                                   genomic_roi_to),
728                     gene_sel);
729         TSeqPos shortest_dist = kLargestGeneDist + 1;
730         while (it) {
731             const CSeq_loc& gene_loc = it->GetLocation();
732             if (is_minus) {
733                 if (sequence::GetStrand(gene_loc) == eNa_strand_minus) {
734                     if (sequence::GetStart(gene_loc, 0) > genomic_start) {
735                         shortest_dist = sequence::GetStart(gene_loc, 0)
736                             - genomic_start - 1;
737                     }
738                 }
739             } else {
740                 if (sequence::GetStrand(gene_loc) != eNa_strand_minus) {
741                     if (sequence::GetStop(gene_loc, 0) < genomic_start) {
742                         shortest_dist = genomic_start
743                             - sequence::GetStop(gene_loc, 0) - 1;
744                     }
745                 }
746             }
747             ++it;
748         }
749         result->SetOutput_data()
750             .AddField("has_nearby_upstream_gene_same_strand",
751                       shortest_dist <= kLargestGeneDist);
752         if (shortest_dist <= kLargestGeneDist) {
753             result->SetOutput_data()
754                 .AddField("distance_from_upstream_gene_same_strand",
755                           int(shortest_dist));
756         }
757 
758     }
759 
760     return ref;
761 }
762 
763 
764 // Conversion from Spliced-seg to disc.
765 // This should probably eventually be moved, e.g., to CSeq_align.
766 
767 static vector<TSignedSeqPos>
s_CalculateStarts(const vector<TSeqPos> & lens,ENa_strand strand,TSeqPos start,TSeqPos end)768 s_CalculateStarts(const vector<TSeqPos>& lens, ENa_strand strand,
769                   TSeqPos start, TSeqPos end)
770 {
771     vector<TSignedSeqPos> rv;
772     rv.reserve(lens.size());
773     TSignedSeqPos offset = 0;
774     ITERATE (vector<TSeqPos>, len, lens) {
775         if (*len == 0) {
776             // a gap
777             rv.push_back(-1);
778         } else {
779             if (IsReverse(strand)) {
780                 offset += *len;
781                 rv.push_back((end + 1) - offset);
782             } else {
783                 rv.push_back(start + offset);
784                 offset += *len;
785             }
786         }
787     }
788     return rv;
789 }
790 
791 
792 static CRef<CDense_seg>
s_ExonToDenseg(const CSpliced_exon & exon,ENa_strand product_strand,ENa_strand genomic_strand,const CSeq_id & product_id,const CSeq_id & genomic_id)793 s_ExonToDenseg(const CSpliced_exon& exon,
794                ENa_strand product_strand, ENa_strand genomic_strand,
795                const CSeq_id& product_id, const CSeq_id& genomic_id)
796 {
797     CRef<CDense_seg> ds(new CDense_seg);
798 
799     vector<TSeqPos> product_lens;
800     vector<TSeqPos> genomic_lens;
801     ITERATE (CSpliced_exon::TParts, iter, exon.GetParts()) {
802         const CSpliced_exon_chunk& part = **iter;
803         if (part.IsMatch()) {
804             product_lens.push_back(part.GetMatch());
805             genomic_lens.push_back(part.GetMatch());
806         } else if (part.IsMismatch()) {
807             product_lens.push_back(part.GetMismatch());
808             genomic_lens.push_back(part.GetMismatch());
809         } else if (part.IsDiag()) {
810             product_lens.push_back(part.GetDiag());
811             genomic_lens.push_back(part.GetDiag());
812         } else if (part.IsProduct_ins()) {
813             product_lens.push_back(part.GetProduct_ins());
814             genomic_lens.push_back(0);
815         } else if (part.IsGenomic_ins()) {
816             product_lens.push_back(0);
817             genomic_lens.push_back(part.GetGenomic_ins());
818         } else {
819             throw runtime_error("unhandled part type in Spliced-enon");
820         }
821     }
822 
823     CDense_seg::TLens& lens = ds->SetLens();
824     lens.reserve(product_lens.size());
825     for (unsigned int i = 0; i < product_lens.size(); ++i) {
826         lens.push_back(max(product_lens[i], genomic_lens[i]));
827     }
828     vector<TSignedSeqPos> product_starts =
829         s_CalculateStarts(product_lens, product_strand,
830                           exon.GetProduct_start().GetNucpos(),
831                           exon.GetProduct_end().GetNucpos());
832     vector<TSignedSeqPos> genomic_starts =
833         s_CalculateStarts(genomic_lens, genomic_strand,
834                           exon.GetGenomic_start(),
835                           exon.GetGenomic_end());
836 
837     CDense_seg::TStarts& starts = ds->SetStarts();
838     starts.reserve(product_starts.size() + genomic_starts.size());
839     for (unsigned int i = 0; i < lens.size(); ++i) {
840         starts.push_back(product_starts[i]);  // product row first
841         starts.push_back(genomic_starts[i]);
842     }
843     ds->SetIds().push_back(CRef<CSeq_id>(SerialClone(product_id)));
844     ds->SetIds().push_back(CRef<CSeq_id>(SerialClone(genomic_id)));
845 
846     // Set strands, unless they're both plus
847     if (!(product_strand == eNa_strand_plus
848           && genomic_strand == eNa_strand_plus)) {
849         CDense_seg::TStrands& strands = ds->SetStrands();
850         for (unsigned int i = 0; i < lens.size(); ++i) {
851             strands.push_back(product_strand);
852             strands.push_back(genomic_strand);
853         }
854     }
855 
856     ds->SetNumseg(lens.size());
857 
858     ds->Compact();  // join adjacent match/mismatch/diag parts
859     return ds;
860 }
861 
862 
s_SplicedToDisc(const CSeq_align & spliced_seg_aln)863 static CRef<CSeq_align> s_SplicedToDisc(const CSeq_align& spliced_seg_aln)
864 {
865 
866     CRef<CSeq_align> disc(new CSeq_align);
867     disc->SetType(CSeq_align::eType_disc);
868 
869     const CSpliced_seg& spliced_seg = spliced_seg_aln.GetSegs().GetSpliced();
870 
871     ENa_strand product_strand = spliced_seg.GetProduct_strand();
872     ENa_strand genomic_strand = spliced_seg.GetGenomic_strand();
873     const CSeq_id& product_id = spliced_seg.GetProduct_id();
874     const CSeq_id& genomic_id = spliced_seg.GetGenomic_id();
875 
876     //for exon in spliced_seg.GetSegs().GetSpliced().GetExons()[:] {
877     ITERATE (CSpliced_seg::TExons, exon, spliced_seg.GetExons()) {
878         CRef<CDense_seg> ds = s_ExonToDenseg(**exon,
879                                              product_strand, genomic_strand,
880                                              product_id, genomic_id);
881         CRef<CSeq_align> ds_align(new CSeq_align);
882         ds_align->SetSegs().SetDenseg(*ds);
883         ds_align->SetType(CSeq_align::eType_partial);
884         disc->SetSegs().SetDisc().Set().push_back(ds_align);
885     }
886     return disc;
887 }
888 
889 
890 END_NCBI_SCOPE
891