1 /*  $Id: gene_model.cpp 632669 2021-06-04 11:34:53Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <algo/sequence/gene_model.hpp>
34 #include <corelib/ncbitime.hpp>
35 #include <objmgr/object_manager.hpp>
36 #include <objmgr/scope.hpp>
37 #include <objmgr/bioseq_handle.hpp>
38 #include <objmgr/seqdesc_ci.hpp>
39 #include <objmgr/annot_ci.hpp>
40 #include <objmgr/feat_ci.hpp>
41 #include <objmgr/align_ci.hpp>
42 #include <objmgr/seq_loc_mapper.hpp>
43 #include <objmgr/util/sequence.hpp>
44 #include <objmgr/util/feature.hpp>
45 #include <objmgr/seq_vector.hpp>
46 
47 #include <objtools/alnmgr/score_builder_base.hpp>
48 
49 #include <objects/general/Dbtag.hpp>
50 #include <objects/general/User_object.hpp>
51 #include <objects/seqalign/Dense_seg.hpp>
52 #include <objects/seqalign/Product_pos.hpp>
53 #include <objects/seqalign/Prot_pos.hpp>
54 #include <objects/seqalign/Seq_align.hpp>
55 #include <objects/seqalign/Spliced_exon_chunk.hpp>
56 #include <objects/seqalign/Spliced_exon.hpp>
57 #include <objects/seqalign/Spliced_seg.hpp>
58 #include <objects/seqalign/Spliced_seg_modifier.hpp>
59 #include <objects/seqalign/Splice_site.hpp>
60 #include <objects/seqfeat/seqfeat__.hpp>
61 #include <objects/seqfeat/SeqFeatXref.hpp>
62 #include <objects/seqfeat/Org_ref.hpp>
63 #include <objects/seqfeat/Prot_ref.hpp>
64 #include <objects/seqblock/GB_block.hpp>
65 #include <objects/seqloc/Seq_id.hpp>
66 #include <objects/seqloc/Seq_interval.hpp>
67 #include <objects/seqloc/Packed_seqint.hpp>
68 #include <objects/seqloc/Seq_loc_mix.hpp>
69 #include <objects/seqloc/Seq_loc_equiv.hpp>
70 #include <objects/seq/seq__.hpp>
71 #include <objects/seq/seqport_util.hpp>
72 #include <objects/general/Object_id.hpp>
73 #include <util/sequtil/sequtil_convert.hpp>
74 #include <util/range_coll.hpp>
75 #include <objmgr/util/seq_loc_util.hpp>
76 
77 #include "feature_generator.hpp"
78 #include <serial/serial.hpp>
79 
80 BEGIN_NCBI_SCOPE
81 USING_SCOPE(objects);
82 USING_SCOPE(sequence);
83 
84 //////////////////////////
CreateGeneModelFromAlign(const CSeq_align & align_in,CScope & scope,CSeq_annot & annot,CBioseq_set & seqs,TGeneModelCreateFlags flags,TSeqPos allowed_unaligned)85 void CGeneModel::CreateGeneModelFromAlign(const CSeq_align& align_in,
86                                           CScope& scope,
87                                           CSeq_annot& annot,
88                                           CBioseq_set& seqs,
89                                           TGeneModelCreateFlags flags,
90                                           TSeqPos allowed_unaligned)
91 {
92     CFeatureGenerator generator(scope);
93     generator.SetFlags(flags | CFeatureGenerator::fGenerateLocalIds);
94     generator.SetAllowedUnaligned(allowed_unaligned);
95 
96     CConstRef<CSeq_align> clean_align = generator.CleanAlignment(align_in);
97     generator.ConvertAlignToAnnot(*clean_align, annot, seqs);
98 }
99 
CreateGeneModelsFromAligns(const list<CRef<objects::CSeq_align>> & aligns,CScope & scope,CSeq_annot & annot,CBioseq_set & seqs,TGeneModelCreateFlags flags,TSeqPos allowed_unaligned)100 void CGeneModel::CreateGeneModelsFromAligns(const list< CRef<objects::CSeq_align> > &aligns,
101                                        CScope& scope,
102                                        CSeq_annot& annot,
103                                        CBioseq_set& seqs,
104                                        TGeneModelCreateFlags flags,
105                                        TSeqPos allowed_unaligned)
106 {
107     CFeatureGenerator generator(scope);
108     generator.SetFlags(flags | CFeatureGenerator::fGenerateLocalIds);
109     generator.SetAllowedUnaligned(allowed_unaligned);
110 
111     generator.ConvertAlignToAnnot(aligns, annot, seqs);
112 }
113 
SetFeatureExceptions(CSeq_feat & feat,CScope & scope,const CSeq_align * align)114 void CGeneModel::SetFeatureExceptions(CSeq_feat& feat,
115                                       CScope& scope,
116                                       const CSeq_align* align)
117 {
118     CFeatureGenerator generator(scope);
119     generator.SetFeatureExceptions(feat, align);
120 }
121 
122 const char* k_except_text_for_gap_filled_gnomon_model =
123     "annotated by transcript or proteomic data";
124 const char* k_rna_comment =
125     "The sequence of the model RefSeq transcript was modified relative "
126     "to this genomic sequence to represent the inferred CDS";
127 const char* k_cds_comment =
128     "The sequence of the model RefSeq protein was modified relative "
129     "to this genomic sequence to represent the inferred CDS";
130 
SetPartialFlags(CScope & scope,CRef<CSeq_feat> gene_feat,CRef<CSeq_feat> mrna_feat,CRef<CSeq_feat> cds_feat)131 void CGeneModel::SetPartialFlags(CScope& scope,
132                                  CRef<CSeq_feat> gene_feat,
133                                  CRef<CSeq_feat> mrna_feat,
134                                  CRef<CSeq_feat> cds_feat)
135 {
136     CFeatureGenerator generator(scope);
137     generator.SetPartialFlags(gene_feat, mrna_feat, cds_feat);
138 }
139 
RecomputePartialFlags(CScope & scope,CSeq_annot & annot)140 void CGeneModel::RecomputePartialFlags(CScope& scope,
141                                        CSeq_annot& annot)
142 {
143     CFeatureGenerator generator(scope);
144     generator.RecomputePartialFlags(annot);
145 }
146 
147 
148 ///
149 /// Return the mol-info object for a given sequence
150 ///
s_GetMolInfo(const CBioseq_Handle & handle)151 static const CMolInfo* s_GetMolInfo(const CBioseq_Handle& handle)
152 {
153     if (handle) {
154         CSeqdesc_CI desc_iter(handle, CSeqdesc::e_Molinfo);
155         for ( ;  desc_iter;  ++desc_iter) {
156             return &desc_iter->GetMolinfo();
157         }
158     }
159 
160     return NULL;
161 }
162 
163 /////////////////////////////////////
164 
SImplementation(CScope & scope)165 CFeatureGenerator::SImplementation::SImplementation(CScope& scope)
166     : m_scope(&scope)
167     , m_flags(fDefaults)
168     , m_min_intron(kDefaultMinIntron)
169     , m_allowed_unaligned(kDefaultAllowedUnaligned)
170     , m_is_gnomon(false)
171     , m_is_best_refseq(false)
172 {
173 }
174 
~SImplementation()175 CFeatureGenerator::SImplementation::~SImplementation()
176 {
177 }
178 
CFeatureGenerator(CRef<CScope> scope)179 CFeatureGenerator::CFeatureGenerator(CRef<CScope> scope)
180     : m_impl(new SImplementation(*scope))
181 {
182 }
183 
CFeatureGenerator(CScope & scope)184 CFeatureGenerator::CFeatureGenerator(CScope& scope)
185     : m_impl(new SImplementation(scope))
186 {
187 }
188 
~CFeatureGenerator()189 CFeatureGenerator::~CFeatureGenerator()
190 {
191 }
192 
SetFlags(TFeatureGeneratorFlags flags)193 void CFeatureGenerator::SetFlags(TFeatureGeneratorFlags flags)
194 {
195     m_impl->m_flags = flags;
196 }
197 
GetFlags() const198 CFeatureGenerator::TFeatureGeneratorFlags CFeatureGenerator::GetFlags() const
199 {
200     return m_impl->m_flags;
201 }
202 
SetMinIntron(TSeqPos value)203 void CFeatureGenerator::SetMinIntron(TSeqPos value)
204 {
205     m_impl->m_min_intron = value;
206 }
207 
SetAllowedUnaligned(TSeqPos value)208 void CFeatureGenerator::SetAllowedUnaligned(TSeqPos value)
209 {
210     m_impl->m_allowed_unaligned = value;
211 }
212 
213 CConstRef<CSeq_align>
CleanAlignment(const CSeq_align & align_in)214 CFeatureGenerator::CleanAlignment(const CSeq_align& align_in)
215 {
216     return m_impl->CleanAlignment(align_in);
217 }
218 
ConvertAlignToAnnot(const CSeq_align & align,CSeq_annot & annot,CBioseq_set & seqs,Int8 gene_id,const CSeq_feat * cdregion)219 CRef<CSeq_feat> CFeatureGenerator::ConvertAlignToAnnot(const CSeq_align& align,
220                                             CSeq_annot& annot,
221                                             CBioseq_set& seqs,
222                                             Int8 gene_id,
223                                             const CSeq_feat* cdregion)
224 {
225     return m_impl->ConvertAlignToAnnot(align, annot, seqs, gene_id, cdregion, false);
226 }
227 
ConvertAlignToAnnot(const list<CRef<CSeq_align>> & aligns,CSeq_annot & annot,CBioseq_set & seqs)228 void CFeatureGenerator::ConvertAlignToAnnot(
229                          const list< CRef<CSeq_align> > &aligns,
230                          CSeq_annot &annot,
231                          CBioseq_set &seqs)
232 {
233     m_impl->ConvertAlignToAnnot(aligns, annot, seqs);
234 }
235 
ConvertLocToAnnot(const objects::CSeq_loc & loc,objects::CSeq_annot & annot,objects::CBioseq_set & seqs,CCdregion::EFrame frame,CRef<objects::CSeq_id> prot_id,CRef<objects::CSeq_id> rna_id)236 void CFeatureGenerator::ConvertLocToAnnot(
237         const objects::CSeq_loc &loc,
238         objects::CSeq_annot& annot,
239         objects::CBioseq_set& seqs,
240         CCdregion::EFrame frame,
241         CRef<objects::CSeq_id> prot_id,
242         CRef<objects::CSeq_id> rna_id)
243 {
244     CBioseq_Handle bsh = m_impl->m_scope->GetBioseqHandle(*loc.GetId());
245     if (!bsh) {
246         NCBI_THROW(CException, eUnknown,
247             "Can't find genomic sequence " + loc.GetId()->AsFastaString());
248     }
249 
250     TFeatureGeneratorFlags old_flags = GetFlags();
251     TFeatureGeneratorFlags flags = old_flags;
252 
253     /// Temporarily change flags to make sure the needed bioseqs are generated,
254     /// and that the input ids are used
255     flags &= ~fGenerateLocalIds;
256     SetFlags(flags);
257 
258     static CAtomicCounter counter;
259     size_t new_id_num = counter.Add(1);
260     CTime time(CTime::eCurrent);
261     if (!rna_id) {
262         string str("lcl|MRNA_");
263         str += time.AsString("YMD");
264         str += "_";
265         str += NStr::NumericToString(new_id_num);
266         rna_id.Reset(new CSeq_id(str));
267     }
268     if (!prot_id) {
269         string str("lcl|PROT_");
270         str += time.AsString("YMD");
271         str += "_";
272         str += NStr::NumericToString(new_id_num);
273         prot_id.Reset(new CSeq_id(str));
274     }
275 
276     CSeq_align fake_align;
277     fake_align.SetType(CSeq_align::eType_not_set);
278     fake_align.SetDim(2);
279     fake_align.SetSegs().SetSpliced().SetProduct_id().Assign(*rna_id);
280     fake_align.SetSegs().SetSpliced().SetGenomic_id().Assign(*loc.GetId());
281     fake_align.SetSegs().SetSpliced().SetProduct_strand(eNa_strand_plus);
282     fake_align.SetSegs().SetSpliced().SetGenomic_strand(loc.GetStrand());
283     fake_align.SetSegs().SetSpliced().SetProduct_type(
284         CSpliced_seg::eProduct_type_transcript);
285 
286     TSeqPos product_pos = 0;
287     ITERATE (CSeq_loc, loc_it, loc) {
288         CRef<CSpliced_exon> exon(new CSpliced_exon);
289         exon->SetProduct_start().SetNucpos(product_pos);
290         product_pos += loc_it.GetRange().GetLength();
291         exon->SetProduct_end().SetNucpos(product_pos-1);
292         exon->SetGenomic_start(loc_it.GetRange().GetFrom());
293         exon->SetGenomic_end(loc_it.GetRange().GetTo());
294         CRef<CSpliced_exon_chunk> match(new CSpliced_exon_chunk);
295         match->SetMatch(loc_it.GetRange().GetLength());
296         exon->SetParts().push_back(match);
297         fake_align.SetSegs().SetSpliced().SetExons().push_back(exon);
298     }
299     fake_align.SetSegs().SetSpliced().SetProduct_length(product_pos);
300 
301     CSeq_feat cdregion;
302     cdregion.SetData().SetCdregion().SetFrame(frame);
303     if (frame != CCdregion::eFrame_one &&
304         !loc.IsPartialStart(eExtreme_Biological))
305     {
306         NCBI_THROW(CException, eUnknown,
307             "Non-standard frame specified with 5'-complete location");
308     }
309 
310     CSeq_loc cdregion_loc(*rna_id, 0, product_pos-1, eNa_strand_plus);
311     if (loc.IsPartialStart(eExtreme_Biological)) {
312         cdregion_loc.SetPartialStart(true, eExtreme_Biological);
313     }
314     if (loc.IsPartialStop(eExtreme_Biological)) {
315         cdregion_loc.SetPartialStop(true, eExtreme_Biological);
316     } else if (flags & fCreateCdregion) {
317         /// location is 3'-complete; verify we have a whole number of codons,
318         /// taking frame into account
319         switch (frame) {
320         case CCdregion::eFrame_two:
321             product_pos -= 1;
322             break;
323 
324         case CCdregion::eFrame_three:
325             product_pos -= 2;
326             break;
327         default:
328             break;
329         }
330 
331         if (product_pos % 3) {
332             NCBI_THROW(CException, eUnknown,
333                 "Non-whole number of codons with 3'-complete location");
334         }
335     }
336 
337     const COrg_ref* org = sequence::GetOrg_refOrNull(bsh);
338     if (org) {
339         CRef<CGenetic_code::C_E> code(new CGenetic_code::C_E);
340         code->SetId(fg::GetGeneticCode(bsh));
341         cdregion.SetData().SetCdregion().SetCode().Set().push_back(code);
342     }
343 
344     cdregion.SetLocation().Assign(cdregion_loc);
345     cdregion.SetProduct().SetWhole(*prot_id);
346 
347     m_impl->ConvertAlignToAnnot(fake_align, annot, seqs, 0, &cdregion, false);
348 
349     /// Restore old flags
350     SetFlags(old_flags);
351 }
352 
SetFeatureExceptions(CSeq_feat & feat,const CSeq_align * align)353 void CFeatureGenerator::SetFeatureExceptions(CSeq_feat& feat,
354                                              const CSeq_align* align)
355 {
356     m_impl->SetFeatureExceptions(feat, align);
357 }
358 
359 
SetPartialFlags(CRef<CSeq_feat> gene_feat,CRef<CSeq_feat> mrna_feat,CRef<CSeq_feat> cds_feat)360 void CFeatureGenerator::SetPartialFlags(CRef<CSeq_feat> gene_feat,
361                                         CRef<CSeq_feat> mrna_feat,
362                                         CRef<CSeq_feat> cds_feat)
363 {
364     m_impl->SetPartialFlags(gene_feat, mrna_feat, cds_feat);
365 }
366 
RecomputePartialFlags(CSeq_annot & annot)367 void CFeatureGenerator::RecomputePartialFlags(CSeq_annot& annot)
368 {
369     m_impl->RecomputePartialFlags(annot);
370 }
371 
372 
SMapper(const CSeq_align & aln,CScope & scope,TSeqPos allowed_unaligned,CSeq_loc_Mapper::TMapOptions opts)373 CFeatureGenerator::SImplementation::SMapper::SMapper(const CSeq_align& aln, CScope& scope,
374                                                      TSeqPos allowed_unaligned,
375                                                      CSeq_loc_Mapper::TMapOptions opts)
376     : m_aln(aln), m_scope(scope), m_genomic_row(-1)
377     , m_allowed_unaligned(allowed_unaligned), m_opts(opts)
378 {
379     if(aln.GetSegs().IsSpliced()) {
380         //row 1 is always genomic in spliced-segs
381         m_genomic_row = 1;
382     } else {
383         //otherwise, find exactly one genomic row
384         CSeq_align::TDim num_rows = aln.CheckNumRows();
385         if (num_rows != 2) {
386             /// make sure we only have two rows. anything else
387             /// represents a mixed-strand case or more than two
388             /// sequences
389             NCBI_THROW(CException, eUnknown,
390                        "CreateGeneModelFromAlign(): "
391                        "failed to create consistent alignment");
392         }
393         for (CSeq_align::TDim i = 0;  i < num_rows;  ++i) {
394             const CSeq_id& id = aln.GetSeq_id(i);
395             CBioseq_Handle handle = scope.GetBioseqHandle(id);
396             if(!handle) {
397                 continue;
398             }
399             const CMolInfo* info =  sequence::GetMolInfo(handle);
400             if (info && info->IsSetBiomol()
401                 && info->GetBiomol() == CMolInfo::eBiomol_genomic)
402                 {
403                     if(m_genomic_row < 0) {
404                         m_genomic_row = i;
405                     } else {
406                         NCBI_THROW(CException, eUnknown,
407                                    "CreateGeneModelFromAlign(): "
408                                    "More than one genomic row in alignment");
409                     }
410                 }
411         }
412         if (m_genomic_row < 0) {
413             NCBI_THROW(CException, eUnknown,
414                        "CreateGeneModelFromAlign(): "
415                        "No genomic sequence found in alignment");
416         }
417     }
418 }
419 
GetRnaLoc()420 const CSeq_loc& CFeatureGenerator::SImplementation::SMapper::GetRnaLoc()
421 {
422     if(rna_loc.IsNull()) {
423         if(m_aln.GetSegs().IsSpliced()) {
424             rna_loc = x_GetLocFromSplicedExons(m_aln);
425         } else {
426             const CSeq_id& id = m_aln.GetSeq_id(GetRnaRow());
427             CBioseq_Handle handle = m_scope.GetBioseqHandle(id);
428             CRef<CSeq_loc> range_loc =
429                 handle.GetRangeSeq_loc(0, 0, eNa_strand_plus); //0-0 meanns whole range
430             //todo: truncate the range loc not to include polyA, or
431             //else the remapped loc will be erroneously partial
432             //not a huge issue as it only applies to seg alignments only.
433             rna_loc = x_Mapper()->Map(*range_loc);
434         }
435     }
436     return *rna_loc;
437 }
438 
GetGenomicRow() const439 CSeq_align::TDim CFeatureGenerator::SImplementation::SMapper::GetGenomicRow() const
440 {
441     return m_genomic_row;
442 }
443 
GetRnaRow() const444 CSeq_align::TDim CFeatureGenerator::SImplementation::SMapper::GetRnaRow() const
445 {
446     //we checked that alignment contains exactly 2 rows
447     return GetGenomicRow() == 0 ? 1 : 0;
448 }
449 
Map(const CSeq_loc & loc)450 CRef<CSeq_loc> CFeatureGenerator::SImplementation::SMapper::Map(const CSeq_loc& loc)
451 {
452     CRef<CSeq_loc> mapped_loc  = x_Mapper()->Map(loc);
453     return mapped_loc;
454 }
455 
IncludeSourceLocs(bool b)456 void CFeatureGenerator::SImplementation::SMapper::IncludeSourceLocs(bool b)
457 {
458     x_Mapper()->IncludeSourceLocs(b);
459 }
460 
SetMergeNone()461 void CFeatureGenerator::SImplementation::SMapper::SetMergeNone()
462 {
463     x_Mapper()->SetMergeNone();
464 }
465 
x_GetLocFromSplicedExons(const CSeq_align & aln) const466 CRef<CSeq_loc> CFeatureGenerator::SImplementation::SMapper::x_GetLocFromSplicedExons(const CSeq_align& aln) const
467 {
468     CRef<CSeq_loc> loc(new CSeq_loc);
469     CConstRef<CSpliced_exon> prev_exon;
470     CRef<CSeq_interval> prev_int;
471 
472     const CSpliced_seg& spliced_seg = aln.GetSegs().GetSpliced();
473     TSeqPos genomic_size = m_scope.GetSequenceLength(spliced_seg.GetGenomic_id());
474     ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
475         const CSpliced_exon& exon = **it;
476         CRef<CSeq_interval> genomic_int(new CSeq_interval);
477 
478         genomic_int->SetId().Assign(aln.GetSeq_id(1));
479         genomic_int->SetFrom(exon.GetGenomic_start());
480         genomic_int->SetTo(exon.GetGenomic_end());
481         genomic_int->SetStrand(
482                                exon.IsSetGenomic_strand() ? exon.GetGenomic_strand()
483                                : spliced_seg.IsSetGenomic_strand() ? spliced_seg.GetGenomic_strand()
484                                : eNa_strand_plus);
485 
486         // check for gaps between exons
487         if(!prev_exon.IsNull() &&
488            !(prev_exon->GetProduct_end().GetNucpos() + 1 == exon.GetProduct_start().GetNucpos() &&
489              ((genomic_int->GetStrand()!=eNa_strand_minus && prev_exon->GetGenomic_end()==genomic_size-1 && exon.GetGenomic_start()==0) ||
490               (genomic_int->GetStrand()==eNa_strand_minus && exon.GetGenomic_end()==genomic_size-1 && prev_exon->GetGenomic_start()==0))
491              )) {
492 
493             bool donor_set = prev_exon->IsSetDonor_after_exon();
494             bool acceptor_set = exon.IsSetAcceptor_before_exon();
495 
496             if(!(donor_set && acceptor_set) || prev_exon->GetProduct_end().GetNucpos() + 1 != exon.GetProduct_start().GetNucpos()) {
497                 // gap between exons on rna. But which exon is partial?
498                 // if have non-strict consensus splice site - blame it
499                 // for partialness. If can't disambiguate on this - set
500                 // both.
501                 bool donor_ok =
502                     (donor_set  &&
503                      prev_exon->GetDonor_after_exon().GetBases() == "GT");
504                 bool acceptor_ok =
505                     (acceptor_set  &&
506                      exon.GetAcceptor_before_exon().GetBases() == "AG");
507                 if(donor_ok || !acceptor_ok) {
508                     genomic_int->SetPartialStart(true, eExtreme_Biological);
509                 }
510                 if(acceptor_ok || !donor_ok) {
511                     prev_int->SetPartialStop(true, eExtreme_Biological);
512                 }
513             }
514         }
515 
516         loc->SetPacked_int().Set().push_back(genomic_int);
517 
518         prev_exon = *it;
519         prev_int = genomic_int;
520     }
521 
522     // set terminal partialness
523     if(m_aln.GetSeqStart(0) > m_allowed_unaligned) {
524         loc->SetPartialStart(true, eExtreme_Biological);
525     }
526 
527     TSeqPos product_len = aln.GetSegs().GetSpliced().GetProduct_length();
528     TSeqPos polya_pos = aln.GetSegs().GetSpliced().CanGetPoly_a() ? aln.GetSegs().GetSpliced().GetPoly_a() : product_len;
529 
530     if(m_aln.GetSeqStop(0) + 1 + m_allowed_unaligned < polya_pos) {
531         loc->SetPartialStop(true, eExtreme_Biological);
532     }
533     return loc;
534 
535 }
536 
x_Mapper()537 CRef<CSeq_loc_Mapper> CFeatureGenerator::SImplementation::SMapper::x_Mapper()
538 {
539     if (!m_mapper) {
540         m_mapper.Reset
541             (new CSeq_loc_Mapper(m_aln, m_aln.GetSeq_id(m_genomic_row),
542                                  &m_scope, m_opts));
543     }
544     return m_mapper;
545 }
546 
547 CConstRef<CSeq_align>
548 CFeatureGenerator::
CleanAlignment(const CSeq_align & align_in)549 SImplementation::CleanAlignment(const CSeq_align& align_in)
550 {
551     if (!align_in.CanGetSegs() || !align_in.GetSegs().IsSpliced())
552         return CConstRef<CSeq_align>(&align_in);
553 
554     CRef<CSeq_align> align(new CSeq_align);
555     align->Assign(align_in);
556 
557     vector<SExon> orig_exons = GetExons(*align);
558 
559     StitchSmallHoles(*align);
560     TrimHolesToCodons(*align);
561 
562     if (m_flags & fMaximizeTranslation) {
563         MaximizeTranslation(*align);
564     }
565 
566     if (GetExons(*align) != orig_exons) {
567         if (m_flags & fMaximizeTranslation) {
568             ClearScores(*align);
569         } else {
570             RecalculateScores(*align);
571         }
572     }
573 
574     return align;
575 }
576 
s_TransformToNucpos(CProduct_pos & pos)577 static void s_TransformToNucpos(CProduct_pos &pos)
578 {
579     TSeqPos nucpos = pos.AsSeqPos();
580     pos.SetNucpos(nucpos);
581 }
582 
ExtractGnomonModelNum(const CSeq_id & seq_id)583 string ExtractGnomonModelNum(const CSeq_id& seq_id)
584 {
585     string model_num;
586     if (seq_id.IsGeneral() && seq_id.GetGeneral().CanGetDb() &&
587             NStr::EqualNocase(seq_id.GetGeneral().GetDb(), "GNOMON")) {
588         model_num = seq_id.GetGeneral().GetTag().GetStr();
589         model_num.erase(model_num.size()-2, 2);
590     }
591     return model_num;
592 }
593 
IsProteinAlign(const CSeq_align & align)594 bool IsProteinAlign(const CSeq_align& align)
595 {
596     return align.CanGetSegs() && align.GetSegs().IsSpliced()
597         && align.GetSegs().GetSpliced().GetProduct_type()
598         == CSpliced_seg::eProduct_type_protein;
599 }
600 
601 void CFeatureGenerator::
TransformProteinAlignToTranscript(CConstRef<CSeq_align> & align,CRef<CSeq_feat> & cd_feat)602 SImplementation::TransformProteinAlignToTranscript(CConstRef<CSeq_align>& align, CRef<CSeq_feat>& cd_feat)
603 {
604         /// This is a protein alignment; transform it into a fake transcript alignment
605         /// so the rest of the processing can go on
606         bool found_start_codon = false;
607         bool found_stop_codon = false;
608         ITERATE (CSpliced_seg::TModifiers, mod_it,
609                  align->GetSegs().GetSpliced().GetModifiers()) {
610             if ((*mod_it)->IsStart_codon_found()) {
611                 found_start_codon = (*mod_it)->GetStart_codon_found();
612             }
613             if ((*mod_it)->IsStop_codon_found()) {
614                 found_stop_codon = (*mod_it)->GetStop_codon_found();
615             }
616         }
617 
618 
619         CBioseq_Handle bsh = m_scope->GetBioseqHandle(align->GetSeq_id(1));
620         if (!bsh) {
621             NCBI_THROW(CException, eUnknown,
622                 "Can't find genomic sequence " +
623                     align->GetSeq_id(1).AsFastaString());
624         }
625 
626         CSeq_align *fake_transcript_align = new CSeq_align;
627         fake_transcript_align->Assign(*align);
628         align.Reset(fake_transcript_align);
629 
630         CRef<CSeq_id> prot_id(new CSeq_id);
631         prot_id->Assign(fake_transcript_align->GetSeq_id(0));
632 
633         {
634             /// for the mRna we have to
635             /// create a local id, since the id we have in the alignment is a
636             /// protein id
637             static CAtomicCounter counter;
638             size_t new_id_num = counter.Add(1);
639             CTime time(CTime::eCurrent);
640             string str("lcl|MRNA_");
641             if ((m_flags & fGenerateStableLocalIds) == 0) {
642                 str += time.AsString("YMD");
643                 str += "_";
644             }
645             str += NStr::NumericToString(new_id_num);
646             CRef<CSeq_id> fake_rna_id(new CSeq_id(str));
647             fake_transcript_align->SetSegs().SetSpliced().SetProduct_id(
648                 *fake_rna_id);
649         }
650         fake_transcript_align->SetSegs().SetSpliced().SetProduct_type(
651             CSpliced_seg::eProduct_type_transcript);
652         NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it,
653                       fake_transcript_align->SetSegs().SetSpliced().SetExons())
654         {
655             s_TransformToNucpos((*exon_it)->SetProduct_start());
656             s_TransformToNucpos((*exon_it)->SetProduct_end());
657         }
658 
659         CRef<CSpliced_exon> last_exon =
660             fake_transcript_align->SetSegs().SetSpliced().SetExons().back();
661         bool aligned_to_the_end =
662             last_exon->GetProduct_end().GetNucpos()+1==
663             fake_transcript_align->GetSegs().GetSpliced().GetProduct_length()*3;
664 
665         fake_transcript_align->SetSegs().SetSpliced().SetProduct_length() =
666             fake_transcript_align->GetSegs().GetSpliced().GetProduct_length()*3 +
667             (((found_stop_codon && aligned_to_the_end) || !aligned_to_the_end)?3:0);
668 
669         if (found_stop_codon && aligned_to_the_end) {
670             bool is_minus = last_exon->IsSetGenomic_strand() ?
671                     last_exon->GetGenomic_strand() == eNa_strand_minus :
672                     (fake_transcript_align->GetSegs().GetSpliced()
673                         . IsSetGenomic_strand() &&
674                      fake_transcript_align->GetSegs().GetSpliced()
675                         . GetGenomic_strand() == eNa_strand_minus);
676 
677             TSeqPos genomic_length = bsh.GetBioseqLength();
678             TSeqPos space_for_codon = min(3u, is_minus
679                 ? last_exon->GetGenomic_start()
680                 : genomic_length - last_exon->GetGenomic_end() - 1);
681             if (space_for_codon < 3) {
682                 if (bsh.GetInst_Topology() != CSeq_inst::eTopology_circular) {
683                     NCBI_THROW(CException, eUnknown,
684                                "Stop codon goes outside genomic sequence");
685                 }
686                 CRef<CSpliced_exon> new_exon(new CSpliced_exon);
687                 new_exon->SetProduct_start().SetNucpos(
688                     last_exon->GetProduct_end().GetNucpos() + space_for_codon + 1);
689                 new_exon->SetProduct_end().SetNucpos(
690                     last_exon->GetProduct_end().GetNucpos() + 3);
691                 new_exon->SetGenomic_start(
692                     is_minus ? genomic_length - 3 + space_for_codon : 0);
693                 new_exon->SetGenomic_end(
694                     is_minus ? genomic_length - 1 : 2 - space_for_codon);
695                 if (last_exon->IsSetProduct_strand()) {
696                     new_exon->SetProduct_strand(last_exon->GetProduct_strand());
697                 }
698                 if (last_exon->IsSetGenomic_strand()) {
699                     new_exon->SetGenomic_strand(last_exon->GetGenomic_strand());
700                 }
701                 fake_transcript_align->SetSegs().SetSpliced().SetExons()
702                     . push_back(new_exon);
703             }
704 
705             /// Extend last exon to include whatever part of stop codon fits
706             last_exon->SetProduct_end().SetNucpos() += space_for_codon;
707             if (is_minus) {
708                 last_exon->SetGenomic_start() -= space_for_codon;
709             } else {
710                 last_exon->SetGenomic_end() += space_for_codon;
711             }
712             if (last_exon->IsSetParts() && space_for_codon) {
713                 CRef<CSpliced_exon_chunk> match_stop_codon
714                     (new CSpliced_exon_chunk);
715                 match_stop_codon->SetMatch(space_for_codon);
716                 last_exon->SetParts().push_back(match_stop_codon);
717             }
718         }
719 
720         cd_feat.Reset(new CSeq_feat);
721         cd_feat->SetData().SetCdregion();
722 
723         CRef<CSeq_loc> cds_on_fake_mrna_loc(new CSeq_loc(
724             fake_transcript_align->SetSegs().SetSpliced().SetProduct_id(),
725             0, fake_transcript_align->GetSegs().GetSpliced().GetProduct_length()-1));
726         if (!found_start_codon &&
727             fake_transcript_align->SetSegs().SetSpliced().SetExons().front()->GetProduct_start().GetNucpos()==0) {
728             cds_on_fake_mrna_loc->SetPartialStart(true, eExtreme_Biological);
729         }
730         if (!found_stop_codon && aligned_to_the_end) {
731             cds_on_fake_mrna_loc->SetPartialStop(true, eExtreme_Biological);
732         }
733         cd_feat->SetLocation(*cds_on_fake_mrna_loc);
734 
735 	const COrg_ref *org = sequence::GetOrg_refOrNull(bsh);
736 	if (org) {
737             CRef<CGenetic_code::C_E> code(new CGenetic_code::C_E);
738             code->SetId(fg::GetGeneticCode(bsh));
739             cd_feat->SetData().SetCdregion().SetCode().Set().push_back(code);
740         }
741 
742         cd_feat->SetProduct().SetWhole(*prot_id);
743 
744 }
745 
RenameGeneratedBioseqs(const CSeq_id & query_rna_id,CSeq_id & transcribed_rna_id,CRef<CSeq_feat> cds_feat_on_query_mrna,CRef<CSeq_feat> cds_feat_on_genome_with_translated_product)746 void  RenameGeneratedBioseqs(const CSeq_id& query_rna_id, CSeq_id& transcribed_rna_id,
747                              CRef<CSeq_feat> cds_feat_on_query_mrna,
748                              CRef<CSeq_feat> cds_feat_on_genome_with_translated_product)
749 {
750     transcribed_rna_id.Assign(query_rna_id);
751     if (cds_feat_on_genome_with_translated_product &&
752         cds_feat_on_genome_with_translated_product->CanGetProduct() &&
753         cds_feat_on_query_mrna &&
754         cds_feat_on_query_mrna->CanGetProduct()) {
755         CSeq_id* translated_protein_id = const_cast<CSeq_id*>(cds_feat_on_genome_with_translated_product->SetProduct().GetId());
756         translated_protein_id->Assign(*cds_feat_on_query_mrna->GetProduct().GetId());
757     }
758 }
759 
760 CRef<CSeq_feat>
761 CFeatureGenerator::
ConvertAlignToAnnot(const CSeq_align & input_align,CSeq_annot & annot,CBioseq_set & seqs,Int8 gene_id,const CSeq_feat * cds_feat_on_query_mrna_ptr,bool call_on_align_list)762 SImplementation::ConvertAlignToAnnot(const CSeq_align& input_align,
763                                      CSeq_annot& annot,
764                                      CBioseq_set& seqs,
765                                      Int8 gene_id,
766                                      const CSeq_feat* cds_feat_on_query_mrna_ptr,
767                                      bool call_on_align_list)
768 {
769     if (HasMixedGenomicIds(input_align)) {
770         return ConvertMixedAlignToAnnot(input_align, annot, seqs, gene_id, cds_feat_on_query_mrna_ptr,
771                                         call_on_align_list);
772     }
773 
774     CConstRef<CSeq_align> align(&input_align);
775     CRef<CSeq_feat> cds_feat_on_query_mrna;
776     bool is_protein_align = IsProteinAlign(*align);
777     if (is_protein_align) {
778         TransformProteinAlignToTranscript(align, cds_feat_on_query_mrna);
779     }
780 
781     CSeq_loc_Mapper::TMapOptions opts = 0;
782     if (m_flags & fDensegAsExon) {
783         opts |= CSeq_loc_Mapper::fAlign_Dense_seg_TotalRange;
784     }
785 
786     SMapper mapper(*align, *m_scope, m_allowed_unaligned, opts);
787 
788     const CSeq_id& query_rna_id = align->GetSeq_id(mapper.GetRnaRow());
789 
790     if (!ExtractGnomonModelNum(query_rna_id).empty()) {
791         m_is_gnomon = true;
792     } else {
793         CSeq_id_Handle best_id;
794         if (!(m_flags & fDeNovoProducts)) {
795             best_id = sequence::GetId(query_rna_id, *m_scope,
796                                       sequence::eGetId_Best);
797         }
798         CSeq_id::EAccessionInfo rna_acc_info =
799             best_id ? best_id.IdentifyAccession() : query_rna_id.IdentifyAccession();
800         m_is_best_refseq = rna_acc_info == CSeq_id::eAcc_refseq_mrna ||
801                            rna_acc_info == CSeq_id::eAcc_refseq_ncrna;
802     }
803 
804 
805     if (cds_feat_on_query_mrna_ptr) {
806         cds_feat_on_query_mrna.Reset(new CSeq_feat);
807         cds_feat_on_query_mrna->Assign(*cds_feat_on_query_mrna_ptr);
808     } else if (!is_protein_align && !(m_flags & fDeNovoProducts)) {
809         CMappedFeat cdregion_handle = GetCdsOnMrna(query_rna_id, *m_scope);
810         if (cdregion_handle) {
811             cds_feat_on_query_mrna.Reset(new CSeq_feat);
812             cds_feat_on_query_mrna->Assign(cdregion_handle.GetMappedFeature());
813         }
814     }
815 
816     CMappedFeat full_length_rna;
817     vector<CMappedFeat> ncRNAs;
818 
819     CBioseq_Handle query_rna_handle = m_scope->GetBioseqHandle(query_rna_id);
820     if (query_rna_handle) {
821         for (CFeat_CI feat_iter(query_rna_handle, CSeqFeatData::e_Rna);
822              feat_iter;  ++feat_iter) {
823             const CSeq_loc &rna_loc = feat_iter->GetLocation();
824             if (feat_iter->GetData().GetSubtype() !=
825                            CSeqFeatData::eSubtype_mRNA &&
826                     ++rna_loc.begin() == rna_loc.end() &&
827                     rna_loc.GetTotalRange().GetLength() ==
828                         query_rna_handle.GetBioseqLength())
829             {
830                 full_length_rna = *feat_iter;
831             } else if (feat_iter->GetData().GetSubtype() ==
832                            CSeqFeatData::eSubtype_ncRNA)
833             {
834                 ncRNAs.push_back(*feat_iter);
835             }
836         }
837     }
838 
839     CTime time(CTime::eCurrent);
840     static CAtomicCounter counter;
841     size_t model_num = counter.Add(1);
842 
843     /// we always need the mRNA location as a reference
844     CRef<CSeq_loc> rna_feat_loc_on_genome(new CSeq_loc);
845     rna_feat_loc_on_genome->Assign(mapper.GetRnaLoc());
846 
847     CRef<CSeq_feat> cds_feat_on_transcribed_mrna;
848     list<CRef<CSeq_loc> > transcribed_mrna_seqloc_refs;
849 
850     /// create a new bioseq for this mRNA; if the mRNA sequence is not found,
851     /// this is needed in order to translate the protein
852     /// alignment, even if flag fForceTranscribeMrna wasn't set
853     CRef<CSeq_id> transcribed_rna_id =
854         x_CreateMrnaBioseq(*align, rna_feat_loc_on_genome, time,
855                            model_num, seqs,
856                            cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna);
857 
858     CRef<CSeq_feat> mrna_feat_on_genome_with_translated_product =
859         full_length_rna && (m_flags&fPropagateNcrnaFeats)
860         /// If there is a full-length RNA feature, propagate it instead of
861         /// creating a new one. Create the bioseq separately
862         ? x_CreateNcRnaFeature(&full_length_rna.GetOriginalFeature(),
863                                *align, rna_feat_loc_on_genome, opts)
864         : x_CreateMrnaFeature(rna_feat_loc_on_genome, query_rna_id,
865                               *transcribed_rna_id, cds_feat_on_query_mrna);
866     if (mrna_feat_on_genome_with_translated_product &&
867         !mrna_feat_on_genome_with_translated_product->IsSetProduct()) {
868         /// Propagated full-length feature; add product
869         mrna_feat_on_genome_with_translated_product->
870             SetProduct().SetWhole().Assign(*transcribed_rna_id);
871     }
872 
873     CRef<CSeq_feat> cds_feat_on_genome_with_translated_product =
874         x_CreateCdsFeature(cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna,
875                            transcribed_mrna_seqloc_refs,
876                            *align, rna_feat_loc_on_genome, time, model_num, seqs, opts);
877 
878     const CSeq_id& genomic_id = align->GetSeq_id(mapper.GetGenomicRow());
879     if (m_is_best_refseq && mrna_feat_on_genome_with_translated_product) {
880         CSeq_id_Handle genomic_acc = sequence::GetId(genomic_id, *m_scope,
881                                                      sequence::eGetId_ForceAcc);
882         if (genomic_acc) {
883             x_AddSelectMarkup(*align, query_rna_handle, *genomic_acc.GetSeqId(),
884                               *mrna_feat_on_genome_with_translated_product,
885                               cds_feat_on_genome_with_translated_product.GetPointer());
886         }
887     }
888 
889     CRef<CSeq_feat> gene_feat;
890 
891     if(!call_on_align_list){
892         if (gene_id) {
893             TGeneMap::iterator gene = genes.find(gene_id);
894             if (gene == genes.end()) {
895                 x_CreateGeneFeature(gene_feat, query_rna_handle, mapper,
896                                     rna_feat_loc_on_genome, genomic_id, gene_id);
897                 if (gene_feat) {
898                     _ASSERT(gene_feat->GetData().Which() !=
899                             CSeqFeatData::e_not_set);
900                     annot.SetData().SetFtable().push_back(gene_feat);
901                 }
902                 gene = genes.insert(make_pair(gene_id,gene_feat)).first;
903             } else {
904                 gene_feat = gene->second;
905                 gene_feat->SetLocation(*MergeSeq_locs(&gene_feat->GetLocation(),
906                                                       &mrna_feat_on_genome_with_translated_product->GetLocation()));
907             }
908 
909             CRef< CSeqFeatXref > genexref( new CSeqFeatXref() );
910             genexref->SetId(*gene_feat->SetIds().front());
911 
912             CRef< CSeqFeatXref > mrnaxref( new CSeqFeatXref() );
913             mrnaxref->SetId(*mrna_feat_on_genome_with_translated_product->SetIds().front());
914 
915             gene_feat->SetXref().push_back(mrnaxref);
916             mrna_feat_on_genome_with_translated_product->SetXref().push_back(genexref);
917 
918         } else {
919             x_CreateGeneFeature(gene_feat, query_rna_handle, mapper,
920                                 rna_feat_loc_on_genome, genomic_id);
921             if (gene_feat) {
922                 _ASSERT(gene_feat->GetData().Which() != CSeqFeatData::e_not_set);
923                 annot.SetData().SetFtable().push_back(gene_feat);
924             }
925         }
926     }
927 
928     if (mrna_feat_on_genome_with_translated_product) {
929         _ASSERT(mrna_feat_on_genome_with_translated_product->GetData().Which() != CSeqFeatData::e_not_set);
930 
931         annot.SetData().SetFtable().push_back(mrna_feat_on_genome_with_translated_product); // NOTE: added after gene!
932     }
933 
934     CSeq_annot::C_Data::TFtable propagated_features;
935 
936     if(cds_feat_on_genome_with_translated_product.NotNull()) {
937         propagated_features.push_back(cds_feat_on_genome_with_translated_product);
938 
939         if (cds_feat_on_query_mrna && cds_feat_on_query_mrna->CanGetProduct()) {
940             CBioseq_Handle prot_handle =
941                 m_scope->GetBioseqHandle(*cds_feat_on_query_mrna->GetProduct().GetId());
942             if (prot_handle) {
943                 for (CFeat_CI feat_iter(prot_handle,
944                                         CSeqFeatData::e_Prot);
945                      feat_iter; ++feat_iter) {
946                     const CProt_ref &prot_ref =
947                         feat_iter->GetData().GetProt();
948                     if (prot_ref.IsSetName() &&
949                         !prot_ref.GetName().empty()) {
950                         CRef< CSeqFeatXref > prot_xref(
951                                                        new CSeqFeatXref());
952                         prot_xref->SetData().SetProt().SetName()
953                             . push_back(prot_ref.GetName().front());
954                         cds_feat_on_genome_with_translated_product->SetXref().push_back(prot_xref);
955                         break;
956                     }
957                 }
958             }
959         }
960     }
961 
962     ITERATE(vector<CMappedFeat>, it, ncRNAs){
963         CRef<CSeq_feat> ncrna_feat =
964             x_CreateNcRnaFeature(&it->GetOriginalFeature(), *align, rna_feat_loc_on_genome, opts);
965         if(ncrna_feat)
966             propagated_features.push_back(ncrna_feat);
967     }
968 
969     NON_CONST_ITERATE(CSeq_annot::C_Data::TFtable, it, propagated_features){
970         _ASSERT((*it)->GetData().Which() != CSeqFeatData::e_not_set);
971         annot.SetData().SetFtable().push_back(*it);
972 
973         if (m_is_gnomon) {  // create xrefs for gnomon models
974             CRef< CSeqFeatXref > propagatedxref( new CSeqFeatXref() );
975             if ((*it)->IsSetIds()) {
976                 propagatedxref->SetId(*(*it)->SetIds().front());
977             }
978 
979             CRef< CSeqFeatXref > mrnaxref( new CSeqFeatXref() );
980             mrnaxref->SetId(*mrna_feat_on_genome_with_translated_product->SetIds().front());
981 
982             (*it)->SetXref().push_back(mrnaxref);
983             mrna_feat_on_genome_with_translated_product->SetXref().push_back(propagatedxref);
984         }
985     }
986 
987     if(!call_on_align_list){
988         if(propagated_features.empty()){
989             SetPartialFlags(gene_feat, mrna_feat_on_genome_with_translated_product, CRef<CSeq_feat>());
990         }
991         ITERATE(CSeq_annot::C_Data::TFtable, it, propagated_features){
992             x_CheckInconsistentDbxrefs(gene_feat, *it);
993             SetPartialFlags(gene_feat, mrna_feat_on_genome_with_translated_product, *it);
994         }
995         x_CopyAdditionalFeatures(query_rna_handle, mapper, annot);
996     }
997 
998     if (!(m_flags & fGenerateLocalIds)) {
999         if (mrna_feat_on_genome_with_translated_product) {
1000             mrna_feat_on_genome_with_translated_product->SetProduct().SetWhole().Assign(query_rna_id);
1001         }
1002         if (cds_feat_on_genome_with_translated_product) {
1003             if (cds_feat_on_query_mrna->CanGetProduct()) {
1004                 cds_feat_on_genome_with_translated_product->
1005                     SetProduct().Assign(cds_feat_on_query_mrna->GetProduct());
1006                 cds_feat_on_transcribed_mrna->
1007                     SetProduct().Assign(cds_feat_on_query_mrna->GetProduct());
1008             }
1009             CRef<CSeq_id> seq_id(new CSeq_id);
1010             seq_id->Assign(query_rna_id);
1011             cds_feat_on_transcribed_mrna->SetLocation().SetId(*seq_id);
1012             NON_CONST_ITERATE (list<CRef<CSeq_loc> >, loc, transcribed_mrna_seqloc_refs) {
1013                 (*loc)->SetId(*seq_id);
1014             }
1015         }
1016 
1017         // rename generated bioseqs if query bioseqs do not exist
1018         if (!query_rna_handle) {
1019             RenameGeneratedBioseqs(query_rna_id, *transcribed_rna_id,
1020                                    cds_feat_on_query_mrna, cds_feat_on_genome_with_translated_product);
1021         }
1022    }
1023 
1024     if (mrna_feat_on_genome_with_translated_product) {
1025         CBioseq_Handle rna_handle =
1026             m_scope->GetBioseqHandle(query_rna_id);
1027         CSeq_entry_Handle rna_seh;
1028         if (!rna_handle) {
1029             rna_seh = m_scope->AddTopLevelSeqEntry(*seqs.SetSeq_set().front());
1030         }
1031 
1032         SetFeatureExceptions(*mrna_feat_on_genome_with_translated_product, align,
1033                              cds_feat_on_genome_with_translated_product.GetPointer(),
1034                              cds_feat_on_query_mrna.GetPointer(),
1035                              cds_feat_on_transcribed_mrna.GetPointer());
1036 
1037         if (rna_seh) {
1038             m_scope->RemoveTopLevelSeqEntry(rna_seh);
1039         }
1040     }
1041     if (cds_feat_on_genome_with_translated_product) {
1042         CBioseq_Handle prot_handle =
1043             m_scope->GetBioseqHandle(*cds_feat_on_genome_with_translated_product->GetProduct().GetId());
1044         CSeq_entry_Handle prot_seh;
1045         if (!prot_handle) {
1046             prot_seh = m_scope->AddTopLevelSeqEntry(*seqs.SetSeq_set().back());
1047         }
1048 
1049         TSeqPos clean_match_count = 0;
1050         SetFeatureExceptions(*cds_feat_on_genome_with_translated_product, align, NULL,
1051                              cds_feat_on_query_mrna.GetPointer(),
1052                              cds_feat_on_transcribed_mrna.GetPointer(),
1053                              &transcribed_mrna_seqloc_refs,
1054                              &clean_match_count);
1055         if (!clean_match_count) {
1056             /// Not even one base matched cleanly; remove feature
1057             annot.SetData().SetFtable().remove(cds_feat_on_genome_with_translated_product);
1058             cds_feat_on_genome_with_translated_product = NULL;
1059         }
1060         if (prot_seh) {
1061             m_scope->RemoveTopLevelSeqEntry(prot_seh);
1062         }
1063     }
1064 
1065     if (!(m_flags & fGenerateLocalIds)) {
1066         RenameGeneratedBioseqs(query_rna_id, *transcribed_rna_id, cds_feat_on_query_mrna, cds_feat_on_genome_with_translated_product);
1067     }
1068     if (m_is_gnomon) {
1069         // add generated bioseqs to the scope
1070         ITERATE(CBioseq_set::TSeq_set, it, seqs.SetSeq_set()) {
1071             m_scope->AddTopLevelSeqEntry(**it);
1072         }
1073     }
1074 
1075     if (!(m_flags & fForceTranscribeMrna) ||
1076         !(m_flags & fForceTranslateCds))
1077     {
1078         /// We created Bioseqs the user didn't ask for,
1079         /// so we need to now remove them
1080         for (CBioseq_set::TSeq_set::iterator bioseq_it =
1081                  seqs.SetSeq_set().begin();
1082             bioseq_it != seqs.SetSeq_set().end(); )
1083         {
1084             if (((*bioseq_it)->GetSeq().IsNa() &&
1085                     !(m_flags & fForceTranscribeMrna)) ||
1086                 ((*bioseq_it)->GetSeq().IsAa() &&
1087                     !(m_flags & fForceTranslateCds)))
1088             {
1089                 bioseq_it = seqs.SetSeq_set().erase(bioseq_it);
1090             } else {
1091                 ++bioseq_it;
1092             }
1093         }
1094     }
1095 
1096     //collapse one interval packed-ints
1097     for (  CTypeIterator<CSeq_loc> loc(annot); loc; ++loc) {
1098         if (loc->IsPacked_int() && loc->GetPacked_int().Get().size()==1) {
1099             CRef<CSeq_interval> interval = loc->SetPacked_int().Set().front();
1100             loc->SetInt(*interval);
1101         }
1102     }
1103     return is_protein_align ? cds_feat_on_genome_with_translated_product : mrna_feat_on_genome_with_translated_product;
1104 }
1105 
1106 void
1107 CFeatureGenerator::
ConvertAlignToAnnot(const list<CRef<CSeq_align>> & aligns,CSeq_annot & annot,CBioseq_set & seqs)1108 SImplementation::ConvertAlignToAnnot(
1109                          const list< CRef<CSeq_align> > &aligns,
1110                          CSeq_annot &annot,
1111                          CBioseq_set &seqs)
1112 {
1113     CSeq_loc_Mapper::TMapOptions opts = 0;
1114     if (m_flags & fDensegAsExon) {
1115         opts |= CSeq_loc_Mapper::fAlign_Dense_seg_TotalRange;
1116     }
1117 
1118     CRef<CSeq_feat> gene_feat;
1119     CSeq_annot gene_annot;
1120     CSeq_id_Handle gene_handle;
1121     ITERATE(list< CRef<CSeq_align> >, align_it, aligns){
1122         CConstRef<CSeq_align> clean_align = CleanAlignment(**align_it);
1123         CRef<CSeq_feat> mrna_feat = ConvertAlignToAnnot(*clean_align, gene_annot, seqs, 0, NULL, true);
1124 
1125         SMapper mapper(*clean_align, *m_scope, m_allowed_unaligned, opts);
1126         const CSeq_id& genomic_id = clean_align->GetSeq_id(mapper.GetGenomicRow());
1127         const CSeq_id& rna_id = clean_align->GetSeq_id(mapper.GetRnaRow());
1128         if(!gene_handle)
1129             gene_handle = CSeq_id_Handle::GetHandle(genomic_id);
1130         else if(!(gene_handle == genomic_id))
1131             NCBI_THROW(CException, eUnknown,
1132                        "Bad list of alignments to ConvertAlignToAnnot(); alignments on different genes");
1133 
1134         CRef<CSeq_loc> loc(new CSeq_loc);
1135         loc->Assign(mapper.GetRnaLoc());
1136 
1137         CBioseq_Handle handle = m_scope->GetBioseqHandle(rna_id);
1138         x_CreateGeneFeature(gene_feat, handle, mapper, loc, genomic_id);
1139 
1140         x_CopyAdditionalFeatures(handle, mapper, gene_annot);
1141     }
1142     NON_CONST_ITERATE(CSeq_annot::C_Data::TFtable, feat_it, gene_annot.SetData().SetFtable())
1143     {
1144         x_CheckInconsistentDbxrefs(gene_feat, *feat_it);
1145     }
1146     gene_annot.SetData().SetFtable().push_front(gene_feat);
1147     RecomputePartialFlags(gene_annot);
1148     annot.SetData().SetFtable().splice(annot.SetData().SetFtable().end(),
1149                                        gene_annot.SetData().SetFtable());
1150 }
1151 
IsContinuous(const CSeq_loc & loc)1152 bool IsContinuous(const CSeq_loc& loc)
1153 {
1154     ITERATE (CSeq_loc, loc_it, loc) {
1155         if ((loc_it.GetRange().GetFrom() != loc.GetStart(eExtreme_Positional) && loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Positional)) ||
1156             (loc_it.GetRange().GetTo() != loc.GetStop(eExtreme_Positional) && loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Positional))) {
1157             return false;
1158         }
1159     }
1160     return true;
1161 }
1162 
AddLiteral(CSeq_inst & inst,const string & seq,CSeq_inst::EMol mol_class)1163 void AddLiteral(CSeq_inst& inst, const string& seq, CSeq_inst::EMol mol_class)
1164 {
1165     if (inst.IsSetExt()) {
1166         if (!inst.SetExt().SetDelta().Set().empty()) {
1167             CDelta_seq& delta_seq = *inst.SetExt().SetDelta().Set().back();
1168             if (delta_seq.IsLiteral() && delta_seq.GetLiteral().IsSetSeq_data()) {
1169                 string iupacna;
1170                 switch(delta_seq.GetLiteral().GetSeq_data().Which()) {
1171                 case CSeq_data::e_Iupacna:
1172                     iupacna = delta_seq.GetLiteral().GetSeq_data().GetIupacna();
1173                     break;
1174                 case CSeq_data::e_Ncbi2na:
1175                     CSeqConvert::Convert(delta_seq.GetLiteral().GetSeq_data().GetNcbi2na().Get(), CSeqUtil::e_Ncbi2na,
1176                                          0, delta_seq.GetLiteral().GetLength(), iupacna, CSeqUtil::e_Iupacna);
1177                     break;
1178                 case CSeq_data::e_Ncbi4na:
1179                     CSeqConvert::Convert(delta_seq.GetLiteral().GetSeq_data().GetNcbi4na().Get(), CSeqUtil::e_Ncbi4na,
1180                                          0, delta_seq.GetLiteral().GetLength(), iupacna, CSeqUtil::e_Iupacna);
1181                     break;
1182                 case CSeq_data::e_Ncbi8na:
1183                     CSeqConvert::Convert(delta_seq.GetLiteral().GetSeq_data().GetNcbi8na().Get(), CSeqUtil::e_Ncbi8na,
1184                                          0, delta_seq.GetLiteral().GetLength(), iupacna, CSeqUtil::e_Iupacna);
1185                     break;
1186                 case CSeq_data::e_Iupacaa:
1187                     delta_seq.SetLiteral().SetSeq_data().SetIupacaa().Set() += seq;
1188                     delta_seq.SetLiteral().SetLength() += seq.size();
1189                     return;
1190                 default:
1191                     inst.SetExt().SetDelta().AddLiteral(seq, mol_class);
1192                     return;
1193                 }
1194                 iupacna += seq;
1195                 delta_seq.SetLiteral().SetSeq_data().SetIupacna().Set(iupacna);
1196                 delta_seq.SetLiteral().SetLength(iupacna.size());
1197                 CSeqportUtil::Pack(&delta_seq.SetLiteral().SetSeq_data());
1198                 return;
1199             }
1200         }
1201         inst.SetExt().SetDelta().AddLiteral(seq, mol_class);
1202     } else {
1203         inst.SetSeq_data().SetIupacna().Set() += seq;
1204     }
1205 }
1206 
1207 void CFeatureGenerator::
x_CollectMrnaSequence(CSeq_inst & inst,const CSeq_align & align,const CSeq_loc & loc,bool add_unaligned_parts,bool mark_transcript_deletions,bool * has_gap,bool * has_indel)1208 SImplementation::x_CollectMrnaSequence(CSeq_inst& inst,
1209                                        const CSeq_align& align,
1210                                        const CSeq_loc& loc,
1211                                        bool add_unaligned_parts,
1212                                        bool mark_transcript_deletions,
1213                                        bool* has_gap,
1214                                        bool* has_indel)
1215 {
1216     /// set up the inst
1217     inst.SetMol(CSeq_inst::eMol_rna);
1218 
1219     // this is created as a transcription of the genomic location
1220 
1221     CSeq_loc_Mapper to_mrna(align, CSeq_loc_Mapper::eSplicedRow_Prod);
1222     CSeq_loc_Mapper to_genomic(align, CSeq_loc_Mapper::eSplicedRow_Gen);
1223 
1224     to_mrna.SetMergeAll();
1225     to_genomic.SetMergeAll();
1226 
1227     int seq_size = 0;
1228     int prev_product_to = -1;
1229     bool prev_fuzz = false;
1230 
1231     for (CSeq_loc_CI loc_it(loc,
1232                             CSeq_loc_CI::eEmpty_Skip,
1233                             CSeq_loc_CI::eOrder_Biological);
1234          loc_it;  ++loc_it) {
1235 
1236         CConstRef<CSeq_loc> exon = loc_it.GetRangeAsSeq_loc();
1237         CRef<CSeq_loc> mrna_loc = to_mrna.Map(*exon);
1238 
1239         if ((prev_product_to > -1  &&
1240              loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Biological))  ||
1241             prev_fuzz) {
1242             if (has_gap != NULL) {
1243                 *has_gap = true;
1244             }
1245             if (!inst.IsSetExt()) {
1246                 inst.SetExt().SetDelta().AddLiteral
1247                     (inst.GetSeq_data().GetIupacna().Get(),CSeq_inst::eMol_rna);
1248                 inst.ResetSeq_data();
1249             }
1250             int gap_len = add_unaligned_parts ? mrna_loc->GetTotalRange().GetFrom()-(prev_product_to+1) : 0;
1251             if (gap_len >= 0) {
1252             seq_size += gap_len;
1253             prev_product_to += gap_len;
1254             inst.SetExt().SetDelta().AddLiteral(gap_len);
1255             if (gap_len == 0)
1256                 inst.SetExt().SetDelta().Set().back()
1257                     ->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1258             }
1259         }
1260 
1261         int part_count = 0;
1262         int mapped_exon_len = 0;
1263         for (CSeq_loc_CI part_it(*mrna_loc);  part_it;  ++part_it) {
1264             ++part_count;
1265             if (prev_product_to<0) {
1266                 prev_product_to = part_it.GetRange().GetFrom()-1;
1267                 if (add_unaligned_parts && part_it.GetRange().GetFrom() > 0) {
1268                     seq_size = part_it.GetRange().GetFrom();
1269                     inst.SetExt().SetDelta().AddLiteral(seq_size);
1270                 }
1271             }
1272             int deletion_len = part_it.GetRange().GetFrom()-(prev_product_to+1);
1273             /// If this is the first part of the mapped segment, the deletion is
1274             /// in the CDS location on the transcript; mark with Ns only if
1275             /// mark_transcript_deletions is set. If this is a later part, the
1276             /// deletion is in the transcript mapping to the genomic sequence;
1277             /// mark always
1278             if (deletion_len > 0) {
1279                 if (mark_transcript_deletions && part_count == 1) {
1280                     // check if the deletion is in the alignment or the original multupart cds
1281 
1282                     CSeq_loc deletion_loc;
1283                     deletion_loc.SetInt().SetId().Assign(part_it.GetSeq_id());
1284                     deletion_loc.SetInt().SetFrom(prev_product_to+1);
1285                     deletion_loc.SetInt().SetTo(part_it.GetRange().GetFrom()-1);
1286 
1287                     deletion_len -= (int)GetLength(*to_genomic.Map(deletion_loc), NULL);
1288                 }
1289 
1290                 if (deletion_len > 0 && (mark_transcript_deletions || part_count > 1)) {
1291                     if (has_indel != NULL) {
1292                         *has_indel = true;
1293                     }
1294                     string deletion(deletion_len, 'N');
1295                     AddLiteral(inst, deletion, CSeq_inst::eMol_rna);
1296                     seq_size += deletion.size();
1297                 }
1298             }
1299 
1300             CConstRef<CSeq_loc> part = part_it.GetRangeAsSeq_loc();
1301             CRef<CSeq_loc> genomic_loc = to_genomic.Map(*part);
1302 
1303             for (CSeq_loc_CI it(*genomic_loc); it; ++it) {
1304                 mapped_exon_len += it.GetRange().GetLength();
1305             }
1306 
1307             CSeqVector vec(*genomic_loc, *m_scope, CBioseq_Handle::eCoding_Iupac);
1308             string seq;
1309             vec.GetSeqData(0, vec.size(), seq);
1310 
1311             AddLiteral(inst, seq, CSeq_inst::eMol_rna);
1312 
1313             seq_size += vec.size();
1314 
1315             prev_product_to = part_it.GetRange().GetTo();
1316         }
1317         if (has_indel != NULL &&
1318             (part_count > 1 ||
1319              mapped_exon_len != loc_it.GetRange().GetLength())) {
1320             *has_indel = true;
1321         }
1322 
1323         prev_fuzz = loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Biological);
1324     }
1325 
1326     if (add_unaligned_parts && align.GetSegs().IsSpliced()) {
1327         const CSpliced_seg& spl = align.GetSegs().GetSpliced();
1328         if (spl.IsSetProduct_length()) {
1329             TSeqPos length = spl.GetProduct_length();
1330             if (seq_size < (int)length) {
1331                 if (!inst.IsSetExt()) {
1332                     inst.SetExt().SetDelta().AddLiteral
1333                         (inst.GetSeq_data().GetIupacna().Get(),CSeq_inst::eMol_rna);
1334                     inst.ResetSeq_data();
1335                 }
1336                 inst.SetExt().SetDelta().AddLiteral(length-seq_size);
1337                 seq_size = length;
1338             }
1339         }
1340     }
1341 
1342     inst.SetLength(seq_size);
1343     if (inst.IsSetExt()) {
1344         inst.SetRepr(CSeq_inst::eRepr_delta);
1345     } else {
1346         inst.SetRepr(CSeq_inst::eRepr_raw);
1347         CSeqportUtil::Pack(&inst.SetSeq_data());
1348     }
1349 }
1350 
1351 CRef<CSeq_id> CFeatureGenerator::
x_CreateMrnaBioseq(const CSeq_align & align,CConstRef<CSeq_loc> rna_feat_loc_on_genome,const CTime & time,size_t model_num,CBioseq_set & seqs,CConstRef<CSeq_feat> cds_feat_on_query_mrna,CRef<CSeq_feat> & cds_feat_on_transcribed_mrna)1352 SImplementation::x_CreateMrnaBioseq(const CSeq_align& align,
1353                                     CConstRef<CSeq_loc> rna_feat_loc_on_genome,
1354                                     const CTime& time,
1355                                     size_t model_num,
1356                                     CBioseq_set& seqs,
1357                                     CConstRef<CSeq_feat> cds_feat_on_query_mrna,
1358                                     CRef<CSeq_feat>& cds_feat_on_transcribed_mrna)
1359 {
1360     CRef<CSeq_entry> entry(new CSeq_entry);
1361     CBioseq& bioseq = entry->SetSeq();
1362 
1363     CRef<CSeqdesc> mdes(new CSeqdesc);
1364     entry->SetSeq().SetDescr().Set().push_back(mdes);
1365     mdes->SetMolinfo().SetBiomol(cds_feat_on_query_mrna.IsNull()
1366         ? CMolInfo::eBiomol_transcribed_RNA : CMolInfo::eBiomol_mRNA);
1367 
1368     CMolInfo::ECompleteness completeness;
1369     if (!IsContinuous(*rna_feat_loc_on_genome)) {
1370         completeness = CMolInfo::eCompleteness_partial;
1371     } else if (cds_feat_on_query_mrna.IsNull()) {
1372         completeness = CMolInfo::eCompleteness_unknown;
1373     } else if (cds_feat_on_query_mrna->GetLocation().IsPartialStart(eExtreme_Biological) &&
1374                cds_feat_on_query_mrna->GetLocation().IsPartialStop(eExtreme_Biological)
1375                ) {
1376         completeness = CMolInfo::eCompleteness_no_ends;
1377     } else if (cds_feat_on_query_mrna->GetLocation().IsPartialStart(eExtreme_Biological)) {
1378         completeness = CMolInfo::eCompleteness_no_left;
1379     } else if (cds_feat_on_query_mrna->GetLocation().IsPartialStop(eExtreme_Biological)) {
1380         completeness = CMolInfo::eCompleteness_no_right;
1381     } else {
1382         completeness = CMolInfo::eCompleteness_unknown;
1383     }
1384     mdes->SetMolinfo().SetCompleteness(completeness);
1385 
1386     x_CollectMrnaSequence(bioseq.SetInst(), align, *rna_feat_loc_on_genome);
1387 
1388     CRef<CSeq_align> assembly(new CSeq_align);
1389     assembly->Assign(align);
1390     bioseq.SetInst().SetHist().SetAssembly().push_back(assembly);
1391 
1392     CRef<CSeq_id> transcribed_rna_id(new CSeq_id);
1393     {{
1394         /// create a new seq-id for this
1395         string str("lcl|CDNA_");
1396         if ((m_flags & fGenerateStableLocalIds) == 0) {
1397             str += time.AsString("YMD");
1398             str += "_";
1399         }
1400         str += NStr::SizetToString(model_num);
1401         transcribed_rna_id->Set(str);
1402     }}
1403     bioseq.SetId().push_back(transcribed_rna_id);
1404 
1405     if (cds_feat_on_query_mrna.NotNull()) {
1406         CRef<CSeq_annot> annot(new CSeq_annot);
1407         entry->SetSeq().SetAnnot().push_back(annot);
1408         _ASSERT(cds_feat_on_query_mrna->GetData().Which() != CSeqFeatData::e_not_set);
1409 
1410         cds_feat_on_transcribed_mrna.Reset(new CSeq_feat);
1411         cds_feat_on_transcribed_mrna->Assign(*cds_feat_on_query_mrna);
1412         cds_feat_on_transcribed_mrna->SetLocation().SetId(*transcribed_rna_id);
1413 
1414         annot->SetData().SetFtable().push_back(cds_feat_on_transcribed_mrna);
1415 
1416         // remap code-breaks
1417         CSeqFeatData::TCdregion& cds =
1418             cds_feat_on_transcribed_mrna->SetData().SetCdregion();
1419         if (cds.IsSetCode_break()) {
1420             for (CCdregion::TCode_break::iterator it = cds.SetCode_break().begin();  it != cds.SetCode_break().end(); ++it) {
1421                 (*it)->SetLoc().SetId(*transcribed_rna_id);
1422             }
1423         }
1424     }
1425 
1426     if ((m_flags & fForceTranscribeMrna) && (m_flags & fForceTranslateCds)) {
1427         seqs.SetClass(CBioseq_set::eClass_nuc_prot);
1428     }
1429 
1430     seqs.SetSeq_set().push_back(entry);
1431 
1432     return transcribed_rna_id;
1433 }
1434 
AddCodeBreak(CSeq_feat & feat,CSeq_loc & loc,char ncbieaa)1435 void AddCodeBreak(CSeq_feat& feat, CSeq_loc& loc, char ncbieaa)
1436 {
1437     CRef<CCode_break> code_break(new CCode_break);
1438     code_break->SetLoc(loc);
1439     code_break->SetAa().SetNcbieaa(ncbieaa);
1440     if (feat.IsSetData() && feat.SetData().IsCdregion()) {
1441         feat.SetData().SetCdregion().SetCode_break().push_back(code_break);
1442     } else {
1443         NCBI_THROW(CException, eUnknown, "Adding code break to non-cdregion feature");
1444     }
1445 }
1446 
1447 const CBioseq& CFeatureGenerator::
x_CreateProteinBioseq(CSeq_loc * cds_loc,CRef<CSeq_feat> cds_feat_on_transcribed_mrna,list<CRef<CSeq_loc>> & transcribed_mrna_seqloc_refs,const CTime & time,size_t model_num,CBioseq_set & seqs)1448 SImplementation::x_CreateProteinBioseq(CSeq_loc* cds_loc,
1449                                        CRef<CSeq_feat> cds_feat_on_transcribed_mrna,
1450                                        list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs,
1451                                        const CTime& time,
1452                                        size_t model_num,
1453                                        CBioseq_set& seqs)
1454 {
1455     CRef<CSeq_entry> entry(new CSeq_entry);
1456     CBioseq& bioseq = entry->SetSeq();
1457 
1458     // create a new seq-id for this
1459     string str("lcl|PROT_");
1460     if ((m_flags & fGenerateStableLocalIds) == 0) {
1461         str += time.AsString("YMD");
1462         str += "_";
1463     }
1464     str += NStr::SizetToString(model_num);
1465     CRef<CSeq_id> translated_protein_id(new CSeq_id(str));
1466     cds_feat_on_transcribed_mrna->SetProduct().SetWhole(*translated_protein_id);
1467 
1468     bioseq.SetId().push_back(translated_protein_id);
1469 
1470     CRef<CSeqdesc> desc(new CSeqdesc);
1471     desc->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1472 
1473     CMolInfo::ECompleteness completeness;
1474     if (!IsContinuous(*cds_loc)) {
1475         completeness = CMolInfo::eCompleteness_partial;
1476     } else if (cds_loc->IsPartialStart(eExtreme_Biological) &&
1477                cds_loc->IsPartialStop(eExtreme_Biological)
1478                ) {
1479         completeness = CMolInfo::eCompleteness_no_ends;
1480     } else if (cds_loc->IsPartialStart(eExtreme_Biological)) {
1481         completeness = CMolInfo::eCompleteness_no_left;
1482     } else if (cds_loc->IsPartialStop(eExtreme_Biological)) {
1483         completeness = CMolInfo::eCompleteness_no_right;
1484     } else {
1485         completeness = CMolInfo::eCompleteness_complete;
1486     }
1487     desc->SetMolinfo().SetCompleteness(completeness);
1488 
1489     bioseq.SetDescr().Set().push_back(desc);
1490 
1491     // set up the inst
1492 
1493     CSeq_entry_Handle mrna_seh = m_scope->AddTopLevelSeqEntry(*seqs.SetSeq_set().back());
1494 
1495     string strprot;
1496     CSeqTranslator::Translate(*cds_feat_on_transcribed_mrna, *m_scope, strprot, true, false);
1497 
1498     CRef<CSeq_loc> protloc_on_mrna(new CSeq_loc);
1499     protloc_on_mrna->Assign(cds_feat_on_transcribed_mrna->GetLocation());
1500     protloc_on_mrna->SetId(*const_cast<CSeq_id*>(cds_feat_on_transcribed_mrna->GetLocation().GetId()));
1501 
1502     // Remove final stop codon from sequence
1503     bool final_code_break = false;
1504     if (!protloc_on_mrna->IsPartialStop(eExtreme_Biological)) {
1505         final_code_break = (strprot[strprot.size()-1] != '*');
1506 
1507         strprot.resize(strprot.size()-1);
1508     }
1509 
1510     CSeq_inst& seq_inst = bioseq.SetInst();
1511     seq_inst.SetMol(CSeq_inst::eMol_aa);
1512 
1513         seq_inst.SetRepr(CSeq_inst::eRepr_delta);
1514         seq_inst.SetExt().SetDelta();
1515         CSeqVector seqv(*protloc_on_mrna, *m_scope, CBioseq_Handle::eCoding_Ncbi);
1516         CConstRef<CSeqMap> map;
1517         map.Reset(&seqv.GetSeqMap());
1518 
1519         const CCdregion& cdr = cds_feat_on_transcribed_mrna->GetData().GetCdregion();
1520         int frame = 0;
1521         if (cdr.IsSetFrame ()) {
1522             switch (cdr.GetFrame ()) {
1523             case CCdregion::eFrame_two :
1524                 frame = 1;
1525                 break;
1526             case CCdregion::eFrame_three :
1527                 frame = 2;
1528                 break;
1529             default :
1530                 break;
1531             }
1532         }
1533 
1534         bool starts_with_code_break = false;
1535         if (cdr.IsSetCode_break()) {
1536             ITERATE (CCdregion::TCode_break, it, cdr.GetCode_break()) {
1537                 if ((*it)->GetLoc().GetStart(eExtreme_Positional) == protloc_on_mrna->GetStart(eExtreme_Positional)) {
1538                     starts_with_code_break = true;
1539                     break;
1540                 }
1541             }
1542         }
1543 
1544         size_t b = 0;
1545         size_t e = 0;
1546         size_t skip_5_prime = 0;
1547         size_t skip_3_prime = 0;
1548         unsigned count_internal_stops = 0;
1549 
1550         for( CSeqMap_CI ci = map->BeginResolved(m_scope.GetPointer()); ci; ci.Next()) {
1551             int codon_start_pos = (int)ci.GetPosition() + frame;
1552             int len = int(ci.GetLength()) - frame;
1553             frame = len >=0 ? -(len%3) : -len;
1554             _ASSERT( -3 < frame && frame < 3 );
1555             len += frame;
1556             if (len==0) {
1557                 if (b==0 &&
1558                     (ci.IsUnknownLength() || !ci.IsSetData()) &&
1559                     cds_loc->IsPartialStart(eExtreme_Biological)) {
1560 
1561                     skip_5_prime += 1;
1562                     b += 1;
1563                     frame += 3;
1564                 }
1565                 continue;
1566             }
1567             e = b + len/3;
1568             bool stop_codon_included = e > strprot.size();
1569             if (stop_codon_included) {
1570                 _ASSERT( len%3 != 0 || !protloc_on_mrna->IsPartialStop(eExtreme_Biological) );
1571                 --e;
1572                 len = len >= 3 ? len-3 : 0;
1573             }
1574 
1575             // template for codon seq-locs
1576             CRef<CSeq_loc> codon_on_mrna = protloc_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1577             codon_on_mrna->SetPartialStart(false, eExtreme_Biological);
1578             codon_on_mrna->SetPartialStop(false, eExtreme_Biological);
1579 
1580 
1581             if (ci.IsUnknownLength()) {
1582                 seq_inst.SetExt().SetDelta().AddLiteral(len);
1583                 seq_inst.SetExt().SetDelta().Set().back()->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1584             } else if (!ci.IsSetData()) { // unaligned mRNA portion
1585                 if (b==skip_5_prime &&
1586                     cds_loc->IsPartialStart(eExtreme_Biological)) {
1587                     skip_5_prime += e-b;
1588                 } else if (stop_codon_included && b==e) {
1589                     // just stop codon
1590                     // do not add zero length gap
1591                 } else {
1592                     if (strprot[b] != 'X') { // preceding partial codon translated unambiguously - add it
1593                         AddLiteral(seq_inst, strprot.substr(b,1), CSeq_inst::eMol_aa);
1594                         b += 1;
1595                     }
1596                     if (b < e) {
1597                         seq_inst.SetExt().SetDelta().AddLiteral(e-b);
1598                     }
1599                 }
1600             } else {
1601                 if (stop_codon_included && final_code_break) {
1602                     TSeqPos pos_on_mrna = codon_start_pos + protloc_on_mrna->GetStart(eExtreme_Positional) + (e-b)*3;
1603                     CRef<CSeq_loc> stop_codon_on_mrna = codon_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1604                     stop_codon_on_mrna->SetInt().SetFrom(pos_on_mrna);
1605                     stop_codon_on_mrna->SetInt().SetTo(pos_on_mrna + 2);
1606                     AddCodeBreak(*cds_feat_on_transcribed_mrna, *stop_codon_on_mrna, '*');
1607                     transcribed_mrna_seqloc_refs.push_back(stop_codon_on_mrna);
1608                 }
1609                 if (b < e) {
1610 
1611                     if (b==0 && strprot[b] != 'M' &&
1612                         !starts_with_code_break &&
1613                         !protloc_on_mrna->IsPartialStart(eExtreme_Biological)) {
1614                         strprot[b] = 'M';
1615                         TSeqPos pos_on_mrna = codon_start_pos + protloc_on_mrna->GetStart(eExtreme_Positional);
1616                         CRef<CSeq_loc> start_codon_on_mrna = codon_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1617                         start_codon_on_mrna->SetInt().SetFrom(pos_on_mrna);
1618                         start_codon_on_mrna->SetInt().SetTo(pos_on_mrna + 2);
1619                         AddCodeBreak(*cds_feat_on_transcribed_mrna, *start_codon_on_mrna, 'M');
1620                         transcribed_mrna_seqloc_refs.push_back(start_codon_on_mrna);
1621                     }
1622 
1623                     // Repair any internal stops with Xs
1624                     size_t stop_aa_pos = b-1;
1625                     while ((stop_aa_pos = strprot.find('*', stop_aa_pos+1)) < e) {
1626                         strprot[stop_aa_pos] = 'X';
1627 
1628                         TSeqPos pos_on_mrna = codon_start_pos + protloc_on_mrna->GetStart(eExtreme_Positional) + (stop_aa_pos-b)*3;
1629                         CRef<CSeq_loc> internal_stop_on_mrna = codon_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1630                         internal_stop_on_mrna->SetInt().SetFrom(pos_on_mrna);
1631                         internal_stop_on_mrna->SetInt().SetTo(pos_on_mrna + 2);
1632                         AddCodeBreak(*cds_feat_on_transcribed_mrna, *internal_stop_on_mrna, 'X');
1633                         transcribed_mrna_seqloc_refs.push_back(internal_stop_on_mrna);
1634                         ++count_internal_stops;
1635                     }
1636 
1637 
1638                     AddLiteral(seq_inst, strprot.substr(b,e-b), CSeq_inst::eMol_aa);
1639                 }
1640             }
1641             b = e;
1642         }
1643         _ASSERT( -2 <= frame && frame <= 0 );
1644 
1645         if (m_is_best_refseq && count_internal_stops) {
1646             CRef<CUser_object> align_info(new CUser_object);
1647             align_info->SetType().SetStr("AlignInfo");
1648             align_info->AddField("num_internal_stop_codon", (int)count_internal_stops);
1649             cds_feat_on_transcribed_mrna->AddExt(align_info);
1650         }
1651 
1652         if (frame < 0) { //last codon partial
1653             if (b < strprot.size() && strprot[b] != 'X') { // last partial codon translated unambiguously - add it
1654                 _ASSERT( b == strprot.size()-1 );
1655                 AddLiteral(seq_inst, strprot.substr(b,1), CSeq_inst::eMol_aa);
1656                 b += 1;
1657                 frame = 0;
1658             }
1659         }
1660 
1661         _ASSERT( b <= strprot.size() &&
1662                  strprot.size() <= b + (frame==0?0:1) );
1663 
1664     if (cds_loc->IsPartialStop(eExtreme_Biological)) {
1665         while (seq_inst.GetExt().GetDelta().Get().size() > 0 &&
1666                !seq_inst.GetExt().GetDelta().Get().back()->GetLiteral().IsSetSeq_data()) {
1667             skip_3_prime += seq_inst.GetExt().GetDelta().Get().back()->GetLiteral().GetLength();
1668             seq_inst.SetExt().SetDelta().Set().pop_back();
1669         }
1670     }
1671 
1672     if (skip_5_prime || skip_3_prime) {
1673         CSeq_loc_Mapper to_prot(*cds_feat_on_transcribed_mrna, CSeq_loc_Mapper::eLocationToProduct);
1674         CSeq_loc_Mapper to_mrna(*cds_feat_on_transcribed_mrna, CSeq_loc_Mapper::eProductToLocation);
1675 
1676         CRef<CSeq_loc> prot_loc = to_prot.Map(*protloc_on_mrna)->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1677 
1678         prot_loc->SetInt().SetFrom(skip_5_prime);
1679         prot_loc->SetInt().SetTo(b-skip_3_prime-1+(skip_3_prime?0:1));
1680         prot_loc->SetPartialStart(skip_5_prime, eExtreme_Biological);
1681         prot_loc->SetPartialStop(skip_3_prime, eExtreme_Biological);
1682 
1683         cds_feat_on_transcribed_mrna->SetLocation(*to_mrna.Map(*prot_loc));
1684     }
1685 
1686     seq_inst.SetLength(b-skip_5_prime-skip_3_prime);
1687 
1688     if (seq_inst.SetExt().SetDelta().Set().size() == 1 && seq_inst.SetExt().SetDelta().Set().back()->GetLiteral().IsSetSeq_data()) {
1689         seq_inst.SetRepr(CSeq_inst::eRepr_raw);
1690         CRef<CSeq_data> dprot(new CSeq_data);
1691         dprot->Assign(seq_inst.SetExt().SetDelta().Set().back()->GetLiteral().GetSeq_data());
1692         seq_inst.SetSeq_data(*dprot);
1693         seq_inst.ResetExt();
1694     }
1695 
1696     if (m_flags & fAddTranslatedCDSAssembly) {
1697         CRef<CSeq_align> mrna_assembly = seqs.SetSeq_set().back()->GetSeq().GetInst().GetHist().GetAssembly().back();
1698 
1699         CRef<CSeq_feat> cds_feat_on_assembly_mrna(new CSeq_feat);
1700         cds_feat_on_assembly_mrna->Assign(*cds_feat_on_transcribed_mrna);
1701         cds_feat_on_assembly_mrna->SetLocation().SetId(mrna_assembly->GetSeq_id(0));
1702 
1703         if ( !cds_feat_on_assembly_mrna->GetLocation().IsPartialStop(eExtreme_Biological)) {
1704             cds_feat_on_assembly_mrna->SetLocation().SetInt().SetTo() -= 3;
1705         }
1706 
1707         CSeq_loc_Mapper to_prot(*cds_feat_on_assembly_mrna, CSeq_loc_Mapper::eLocationToProduct);
1708         CRef<CSeq_align> prot_assembly = to_prot.Map(*mrna_assembly);
1709 
1710         prot_assembly->SetSegs().SetSpliced().SetProduct_length(seq_inst.GetLength());
1711 
1712         seq_inst.SetHist().SetAssembly().push_back(prot_assembly);
1713     }
1714 
1715 #ifdef _DEBUG
1716     CSeq_entry_Handle prot_seh = m_scope->AddTopLevelSeqEntry(*entry);
1717 
1718     CBioseq_Handle prot_h = m_scope->GetBioseqHandle(*translated_protein_id);
1719     CSeqVector vec(prot_h, CBioseq_Handle::eCoding_Iupac);
1720     string result;
1721     vec.GetSeqData(0, vec.size(), result);
1722     _ASSERT( b-skip_5_prime-skip_3_prime==result.size() );
1723 
1724     m_scope->RemoveTopLevelSeqEntry(prot_seh);
1725 #endif
1726 
1727     m_scope->RemoveTopLevelSeqEntry(mrna_seh);
1728 
1729     seqs.SetSeq_set().push_back(entry);
1730     return bioseq;
1731 }
1732 
1733 CRef<CSeq_feat>
1734 CFeatureGenerator::
x_CreateMrnaFeature(CRef<CSeq_loc> loc,const CSeq_id & query_rna_id,CSeq_id & transcribed_rna_id,CConstRef<CSeq_feat> cds_feat_on_query_mrna)1735 SImplementation::x_CreateMrnaFeature(CRef<CSeq_loc> loc,
1736                                      const CSeq_id& query_rna_id,
1737                                      CSeq_id& transcribed_rna_id,
1738                                      CConstRef<CSeq_feat> cds_feat_on_query_mrna)
1739 {
1740     CRef<CSeq_feat> mrna_feat;
1741     if (m_flags & fCreateMrna) {
1742         mrna_feat.Reset(new CSeq_feat());
1743         CRNA_ref::TType type = CRNA_ref::eType_unknown;
1744         string name;
1745         string RNA_class;
1746 
1747         string gnomon_model_num = ExtractGnomonModelNum(query_rna_id);
1748         if (!gnomon_model_num.empty()) {
1749             CRef<CObject_id> obj_id( new CObject_id() );
1750             obj_id->SetStr("rna." + gnomon_model_num);
1751             CRef<CFeat_id> feat_id( new CFeat_id() );
1752             feat_id->SetLocal(*obj_id);
1753             mrna_feat->SetIds().push_back(feat_id);
1754         }
1755 
1756         mrna_feat->SetProduct().SetWhole().Assign(transcribed_rna_id);
1757         CBioseq_Handle handle = m_scope->GetBioseqHandle(query_rna_id);
1758         if (handle) {
1759             const CMolInfo* info = s_GetMolInfo(handle);
1760             if (info  &&  info->IsSetBiomol()) {
1761                 switch (info->GetBiomol()) {
1762                 case CMolInfo::eBiomol_mRNA:
1763                     type = CRNA_ref::eType_mRNA;
1764                     break;
1765                 case CMolInfo::eBiomol_rRNA:
1766                     type = CRNA_ref::eType_rRNA;
1767                     break;
1768                 case CMolInfo::eBiomol_tRNA:
1769                     type = CRNA_ref::eType_tRNA;
1770                     break;
1771                 case CMolInfo::eBiomol_snRNA:
1772                     type = CRNA_ref::eType_snRNA;
1773                     break;
1774                 case CMolInfo::eBiomol_snoRNA:
1775                     type = CRNA_ref::eType_snoRNA;
1776                     break;
1777                 case CMolInfo::eBiomol_scRNA:
1778                     type = CRNA_ref::eType_scRNA;
1779                     break;
1780                 case CMolInfo::eBiomol_pre_RNA:
1781                     type = CRNA_ref::eType_premsg;
1782                     break;
1783                 case CMolInfo::eBiomol_ncRNA:
1784                     type = CRNA_ref::eType_ncRNA;
1785                     if (info->IsSetGbmoltype()) {
1786                         RNA_class = info->GetGbmoltype();
1787                     }
1788                     break;
1789                 case CMolInfo::eBiomol_transcribed_RNA:
1790                     type = CRNA_ref::eType_miscRNA;
1791                     break;
1792                 default:
1793                     type = CRNA_ref::eType_other;
1794                     break;
1795                 }
1796             }
1797         } else {
1798             type = cds_feat_on_query_mrna.IsNull()
1799                 ? CRNA_ref::eType_miscRNA : CRNA_ref::eType_mRNA;
1800         }
1801 
1802         mrna_feat->SetData().SetRna().SetType(type);
1803         if (!RNA_class.empty()) {
1804             mrna_feat->SetData().SetRna().SetExt().SetGen().SetClass(RNA_class);
1805         }
1806         name = x_ConstructRnaName(handle);
1807         if (!name.empty()) {
1808             if (!RNA_class.empty()) {
1809                 mrna_feat->SetData().SetRna().SetExt().SetGen().SetProduct(name);
1810             } else {
1811                 mrna_feat->SetData().SetRna().SetExt().SetName(name);
1812             }
1813         }
1814 
1815         mrna_feat->SetLocation(*loc);
1816     }
1817     return mrna_feat;
1818 }
1819 
1820 void
1821 CFeatureGenerator::
x_CreateGeneFeature(CRef<CSeq_feat> & gene_feat,const CBioseq_Handle & handle,SMapper & mapper,CRef<CSeq_loc> loc,const CSeq_id & genomic_id,Int8 gene_id)1822 SImplementation::x_CreateGeneFeature(CRef<CSeq_feat> &gene_feat,
1823                                      const CBioseq_Handle& handle,
1824                                      SMapper& mapper,
1825                                      CRef<CSeq_loc> loc,
1826                                      const CSeq_id& genomic_id, Int8 gene_id)
1827 {
1828     if (m_flags & fCreateGene) {
1829         CFeat_CI feat_iter;
1830         if (handle) {
1831             feat_iter = CFeat_CI(handle, CSeqFeatData::eSubtype_gene);
1832         }
1833         bool update_existing_gene = gene_feat;
1834         string gene_id_str = "gene.";
1835         if (gene_id) {
1836             gene_id_str += NStr::NumericToString(gene_id);
1837         }
1838 
1839         if (!update_existing_gene) {
1840             if (feat_iter  &&  feat_iter.GetSize()) {
1841                 gene_feat.Reset(new CSeq_feat());
1842                 gene_feat->Assign(feat_iter->GetOriginalFeature());
1843             }
1844             if (!(m_flags & fPropagateOnly)) {
1845                 /// if we didn't find am existing gene feature, create one
1846                 if (!gene_feat) {
1847                     gene_feat.Reset(new CSeq_feat());
1848                     gene_feat->SetData().SetGene();
1849                 }
1850                 if (gene_id) {
1851                     CRef<CObject_id> obj_id( new CObject_id() );
1852                     obj_id->SetStr(gene_id_str);
1853                     CRef<CFeat_id> feat_id( new CFeat_id() );
1854                     feat_id->SetLocal(*obj_id);
1855                     gene_feat->SetIds().push_back(feat_id);
1856                 }
1857             }
1858         }
1859 
1860         if (!gene_feat) {
1861             /// Couldn't create gene feature
1862             return;
1863         }
1864 
1865         CRef<CSeq_loc> gene_loc;
1866         if (!(m_flags & fPropagateOnly)) {
1867             gene_loc = loc;
1868         } else if (feat_iter  &&  feat_iter.GetSize()) {
1869             gene_loc = mapper.Map(feat_iter->GetLocation());
1870         }
1871 
1872         if (gene_loc) {
1873             gene_feat->SetLocation
1874                 (*MergeSeq_locs(gene_loc,
1875                                 update_existing_gene ? &gene_feat->GetLocation() : NULL));
1876         }
1877 
1878         if (feat_iter  &&  feat_iter.GetSize() == 1 && update_existing_gene) {
1879             /// check if gene feature has any dbxrefs that we don't have yet
1880             if (feat_iter->IsSetDbxref()) {
1881                 ITERATE (CSeq_feat::TDbxref, xref_it,
1882                          feat_iter->GetDbxref()) {
1883                     CRef<CDbtag> tag(new CDbtag);
1884                     tag->Assign(**xref_it);
1885                     bool duplicate = false;
1886                     if(gene_feat->IsSetDbxref()){
1887                         /// Check for duplications
1888                         ITERATE(CSeq_feat::TDbxref, previous_xref_it,
1889                                 gene_feat->GetDbxref())
1890                             if((*previous_xref_it)->Match(**xref_it)){
1891                                 duplicate = true;
1892                                 break;
1893                             }
1894                     }
1895                     if(!duplicate)
1896                         gene_feat->SetDbxref().push_back(tag);
1897                 }
1898             }
1899         }
1900 
1901         if (gene_id) {
1902             /// Special case for gnomon, set gene desc from gnomon id
1903             gene_feat->SetData().SetGene().SetDesc(gene_id_str);
1904         }
1905     }
1906 }
1907 
1908 CRef<CSeq_feat>
1909 CFeatureGenerator::
x_CreateCdsFeature(CConstRef<CSeq_feat> cds_feat_on_query_mrna,CRef<CSeq_feat> cds_feat_on_transcribed_mrna,list<CRef<CSeq_loc>> & transcribed_mrna_seqloc_refs,const CSeq_align & align,CRef<CSeq_loc> loc,const CTime & time,size_t model_num,CBioseq_set & seqs,CSeq_loc_Mapper::TMapOptions opts)1910 SImplementation::x_CreateCdsFeature(CConstRef<CSeq_feat> cds_feat_on_query_mrna,
1911                                     CRef<CSeq_feat> cds_feat_on_transcribed_mrna,
1912                                     list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs,
1913                                     const CSeq_align& align,
1914                                     CRef<CSeq_loc> loc,
1915                                     const CTime& time,
1916                                     size_t model_num,
1917                                     CBioseq_set& seqs,
1918                                     CSeq_loc_Mapper::TMapOptions opts)
1919 {
1920     CRef<CSeq_feat> cds_feat;
1921     if (!(m_flags & fCreateCdregion) || cds_feat_on_query_mrna.IsNull()) {
1922         return cds_feat;
1923     }
1924 
1925             TSeqPos offset;
1926             CRef<CSeq_feat> cds_feat_on_genome = x_MapFeature(cds_feat_on_query_mrna.GetNonNullPointer(),
1927                                                               align, loc, opts, offset);
1928             CRef<CSeq_loc> cds_loc;
1929             if (cds_feat_on_genome) {
1930                 cds_loc = &cds_feat_on_genome->SetLocation();
1931             }
1932             if (cds_loc  && cds_loc->Which() != CSeq_loc::e_not_set) {
1933                 CRangeCollection<TSeqPos> loc_ranges;
1934                 ITERATE (CSeq_loc, loc_it, *cds_loc) {
1935                     loc_ranges += loc_it.GetRange();
1936                 }
1937 
1938                 bool is_partial_5prime = offset > 0 || cds_loc->IsPartialStart(eExtreme_Biological);
1939                 cds_loc->SetPartialStart(is_partial_5prime, eExtreme_Biological);
1940 
1941                 string gnomon_model_num;
1942 
1943                 if (cds_feat_on_query_mrna->CanGetProduct()) {
1944                     gnomon_model_num = ExtractGnomonModelNum(
1945                         *cds_feat_on_query_mrna->GetProduct().GetId());
1946                 }
1947                     /// create a new bioseq for the CDS
1948                     if (!gnomon_model_num.empty()) {
1949                         CRef<CObject_id> obj_id( new CObject_id() );
1950                         obj_id->SetStr("cds." + gnomon_model_num);
1951                         CRef<CFeat_id> feat_id( new CFeat_id() );
1952                         feat_id->SetLocal(*obj_id);
1953                         cds_feat_on_transcribed_mrna->SetIds().push_back(feat_id);
1954                     }
1955                     x_CreateProteinBioseq(cds_loc, cds_feat_on_transcribed_mrna,
1956                                           transcribed_mrna_seqloc_refs,
1957                                           time, model_num, seqs);
1958 
1959                cds_feat.Reset(new CSeq_feat());
1960                cds_feat->Assign(*cds_feat_on_transcribed_mrna);
1961                cds_feat->ResetId();
1962 
1963                cds_feat->SetLocation(*cds_loc);
1964 
1965                 /// make sure we set the CDS frame correctly
1966                 /// if we're 5' partial, we may need to adjust the frame
1967                 /// to ensure that conceptual translations are in-frame
1968                 if (is_partial_5prime && offset) {
1969                     int orig_frame = 0;
1970                     if (cds_feat->GetData().GetCdregion().IsSetFrame()) {
1971                         orig_frame = cds_feat->GetData()
1972                             .GetCdregion().GetFrame();
1973                         if (orig_frame) {
1974                             orig_frame -= 1;
1975                         }
1976                     }
1977                     int frame = (offset - orig_frame) % 3;
1978                     if (frame < 0) {
1979                         frame = -frame;
1980                     }
1981                     frame = (3 - frame) % 3;
1982                     if (frame != orig_frame) {
1983                         switch (frame) {
1984                         case 0:
1985                             cds_feat->SetData().SetCdregion()
1986                                 .SetFrame(CCdregion::eFrame_one);
1987                             break;
1988                         case 1:
1989                             cds_feat->SetData().SetCdregion()
1990                                 .SetFrame(CCdregion::eFrame_two);
1991                             break;
1992                         case 2:
1993                             cds_feat->SetData().SetCdregion()
1994                                 .SetFrame(CCdregion::eFrame_three);
1995                             break;
1996 
1997                         default:
1998                             NCBI_THROW(CException, eUnknown,
1999                                        "mod 3 out of bounds");
2000                         }
2001                     }
2002                 }
2003 
2004                 if (!gnomon_model_num.empty() && !is_partial_5prime) {
2005                     int cds_start = cds_feat_on_transcribed_mrna->GetLocation().GetTotalRange().GetFrom();
2006                     if (cds_start >= 3) {
2007                         CBioseq_Handle rna_handle =
2008                             m_scope->GetBioseqHandle(*cds_feat_on_transcribed_mrna->GetLocation().GetId());
2009 
2010                         string strprot;
2011                         if (rna_handle) {
2012                             CSeqVector vec(rna_handle, CBioseq_Handle::eCoding_Iupac);
2013                             string mrna;
2014                             vec.GetSeqData(cds_start % 3, cds_start, mrna);
2015                             const CGenetic_code *code = NULL;
2016                             if (cds_feat_on_transcribed_mrna->GetData().GetCdregion().IsSetCode()) {
2017                                 code = &cds_feat_on_transcribed_mrna->GetData().GetCdregion().GetCode();
2018                             }
2019                             CSeqTranslator::Translate
2020                                 (mrna, strprot,
2021                                  CSeqTranslator::fIs5PrimePartial, code);
2022                         }
2023                         SIZE_TYPE stop_5prime = strprot.rfind('*');
2024                         if (stop_5prime != NPOS) {
2025                             stop_5prime = stop_5prime*3+cds_start%3;
2026                             CRef<CSeq_feat> stop_5prime_feature(new CSeq_feat);
2027                             stop_5prime_feature->SetData().SetImp().SetKey("misc_feature");
2028                             stop_5prime_feature->SetComment("upstream in-frame stop codon");
2029                             CRef<CSeq_loc> stop_5prime_location(new CSeq_loc());
2030                             stop_5prime_location->SetInt().SetFrom(stop_5prime);
2031                             stop_5prime_location->SetInt().SetTo(stop_5prime+2);
2032                             stop_5prime_location->SetInt().SetStrand(eNa_strand_plus);
2033                             stop_5prime_location->SetId(*rna_handle.GetSeqId());
2034                             stop_5prime_feature->SetLocation(*stop_5prime_location);
2035 
2036                             SAnnotSelector sel(CSeq_annot::C_Data::e_Ftable);
2037                             sel.SetResolveNone();
2038                             CAnnot_CI it(rna_handle, sel);
2039                             it->GetEditHandle().AddFeat(*stop_5prime_feature);
2040                         }
2041                     }
2042                 }
2043 
2044                 /// also copy the code break if it exists
2045                 if (cds_feat->GetData().GetCdregion().IsSetCode_break()) {
2046                     SMapper mapper(align, *m_scope, 0, opts);
2047                     mapper.IncludeSourceLocs();
2048                     mapper.SetMergeNone();
2049 
2050                     CSeqFeatData::TCdregion& cds =
2051                         cds_feat->SetData().SetCdregion();
2052                     CCdregion::TCode_break::iterator it =
2053                         cds.SetCode_break().begin();
2054                     for ( ;  it != cds.SetCode_break().end();  ) {
2055                         CSeq_loc code_break_loc;
2056                         code_break_loc.Assign((*it)->GetLoc());
2057                         code_break_loc.SetId(align.GetSeq_id(0)); // set query mrna id - the mapper maps from query mrna to genome
2058                         CRef<CSeq_loc> new_cb_loc = mapper.Map(code_break_loc);
2059 
2060                         // we may get an equiv. If we do, get just mapped loc
2061                         if (new_cb_loc->IsEquiv()) {
2062                             new_cb_loc = new_cb_loc->GetEquiv().Get().front();
2063                         }
2064 
2065                         CRangeCollection<TSeqPos> new_cb_ranges;
2066                         if (new_cb_loc && !new_cb_loc->IsNull()) {
2067                             ITERATE (CSeq_loc, loc_it, *new_cb_loc) {
2068                                 new_cb_ranges += loc_it.GetRange();
2069                             }
2070                             new_cb_ranges &= loc_ranges;
2071                         }
2072                         if (new_cb_ranges.GetCoveredLength() == 3) {
2073                             (*it)->SetLoc(*new_cb_loc);
2074                             ++it;
2075                         } else {
2076                             it = cds.SetCode_break().erase(it);
2077                         }
2078                     }
2079                     if (cds.GetCode_break().empty()) {
2080                         cds.ResetCode_break();
2081                     }
2082                 }
2083 
2084             }
2085 
2086     return cds_feat;
2087 }
2088 
2089 string CFeatureGenerator::
x_ConstructRnaName(const CBioseq_Handle & handle)2090 SImplementation::x_ConstructRnaName(const CBioseq_Handle& handle)
2091 {
2092     string name;
2093     if (handle) {
2094         name = sequence::CDeflineGenerator().GenerateDefline(handle);
2095         try {
2096             const COrg_ref &org = sequence::GetOrg_ref(handle);
2097             if (org.IsSetTaxname() && NStr::StartsWith(name, org.GetTaxname())) {
2098                 name.erase(0, org.GetTaxname().size());
2099             }
2100         }
2101         catch (CException&) {
2102         }
2103         NStr::ReplaceInPlace(name, ", nuclear gene encoding mitochondrial protein",
2104                                    "");
2105         CFeat_CI feat_iter(handle, CSeqFeatData::eSubtype_gene);
2106         if (feat_iter && feat_iter.GetSize() &&
2107             feat_iter->GetData().GetGene().IsSetLocus())
2108         {
2109             NStr::ReplaceInPlace(
2110                 name, " (" + feat_iter->GetData().GetGene().GetLocus() + ')', "");
2111         }
2112         size_t last_comma = name.rfind(',');
2113         if (last_comma != string::npos) {
2114             name.erase(last_comma);
2115         }
2116         NStr::TruncateSpacesInPlace(name);
2117     }
2118     return name;
2119 }
2120 
2121 CRef<CSeq_feat>
2122 CFeatureGenerator::
x_CreateNcRnaFeature(const CSeq_feat * ncrnafeature_on_mrna,const CSeq_align & align,CConstRef<CSeq_loc> loc,CSeq_loc_Mapper::TMapOptions opts)2123 SImplementation::x_CreateNcRnaFeature(const CSeq_feat* ncrnafeature_on_mrna,
2124                                       const CSeq_align& align,
2125                                       CConstRef<CSeq_loc> loc,
2126                                       CSeq_loc_Mapper::TMapOptions opts)
2127 {
2128     CRef<CSeq_feat> ncrna_feat;
2129     if ((m_flags & fPropagateNcrnaFeats) && ncrnafeature_on_mrna != NULL) {
2130 
2131         TSeqPos offset;
2132         CRef<CSeq_loc> non_const_loc(new CSeq_loc); // x_MapFeature requires non-const loc
2133         non_const_loc->Assign(*loc);
2134         ncrna_feat = x_MapFeature(ncrnafeature_on_mrna,
2135                                   align, non_const_loc, opts, offset);
2136     }
2137     return ncrna_feat;
2138 }
2139 
2140 namespace {
ChangeToMix(const CSeq_loc & a)2141 CRef<CSeq_loc> ChangeToMix(const CSeq_loc& a)
2142 {
2143     CRef<CSeq_loc> a_mix(new CSeq_loc);
2144     a_mix->Assign(a);
2145     a_mix->ChangeToMix();
2146     return a_mix;
2147 }
2148 
SubtractPreserveBiologicalOrder(const CSeq_loc & a,const CSeq_loc & b)2149 CRef<CSeq_loc> SubtractPreserveBiologicalOrder(const CSeq_loc& a, const CSeq_loc& b)
2150 {
2151     CRef<CSeq_loc> a_mix = ChangeToMix(a);
2152     CRef<CSeq_loc> b_mix = ChangeToMix(b);
2153 
2154           list< CRef< CSeq_loc > >& a_list = a_mix->SetMix().Set();
2155     const list< CRef< CSeq_loc > >& b_list = b_mix->GetMix().Get();
2156 
2157     ITERATE (list< CRef< CSeq_loc > >, b_i, b_list) {
2158         for (list< CRef< CSeq_loc > >::iterator a_i = a_list.begin(); a_i != a_list.end();) {
2159 
2160             CRef<CSeq_loc> diff = ChangeToMix(*(*a_i)->Subtract(**b_i, CSeq_loc::fSort, nullptr, nullptr));
2161             a_list.splice(a_i, diff->SetMix().Set());
2162             a_i = a_list.erase(a_i);
2163         }
2164     }
2165     if (a_list.size() == 1) {
2166         return a_list.front();
2167     }
2168     a_mix->ChangeToPackedInt();
2169     return a_mix;
2170 }
2171 }
2172 
2173 CRef<CSeq_feat>
2174 CFeatureGenerator::
x_MapFeature(const objects::CSeq_feat * feature_on_mrna,const CSeq_align & align,CRef<CSeq_loc> loc,CSeq_loc_Mapper::TMapOptions opts,TSeqPos & offset)2175 SImplementation::x_MapFeature(const objects::CSeq_feat* feature_on_mrna,
2176                                             const CSeq_align& align,
2177                                             CRef<CSeq_loc> loc,
2178                                             CSeq_loc_Mapper::TMapOptions opts,
2179                                             TSeqPos &offset)
2180 {
2181     // from this point on, we will get complex locations back
2182     SMapper mapper(align, *m_scope, 0, opts);
2183     mapper.IncludeSourceLocs();
2184     mapper.SetMergeNone();
2185 
2186     CRef<CSeq_loc> mapped_loc;
2187 
2188     ///
2189     /// in general, feature has only one segment on mRNA; there are some corner cases
2190     /// (OAZ1, OAZ2, PAZ3, PEG10) in which there are more than one
2191     /// segment.
2192     /// we map each segment separately because we want to stitch genomic insertions,
2193     /// but not segment boundaries.
2194     ///
2195     for (CSeq_loc_CI loc_it(feature_on_mrna->GetLocation());
2196          loc_it;  ++loc_it) {
2197         /// location for this interval
2198         CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
2199 
2200         /// map it
2201         CRef<CSeq_loc> equiv = mapper.Map(*this_loc);
2202         if ( !equiv  ||
2203              equiv->IsNull()  ||
2204              equiv->IsEmpty() ) {
2205             continue;
2206         }
2207 
2208         /// we are using a special variety that will tell us what
2209         /// portion really mapped
2210         ///
2211         /// the first part is the mapped location
2212 
2213         if (equiv->GetEquiv().Get().size() != 2) {
2214             NCBI_THROW(CException, eUnknown,
2215                        "failed to find requisite parts of "
2216                        "mapped seq-loc");
2217         }
2218         CRef<CSeq_loc> this_loc_mapped =
2219             equiv->GetEquiv().Get().front();
2220         if ( !this_loc_mapped  ||
2221              this_loc_mapped->IsNull()  ||
2222              this_loc_mapped->IsEmpty() ) {
2223             continue;
2224         }
2225 
2226         if ( !mapped_loc ) {
2227             mapped_loc.Reset(new CSeq_loc);
2228             /// This is start of mapped location; record offset
2229             offset = equiv->GetEquiv().Get().back()->GetTotalRange().GetFrom() -
2230                      feature_on_mrna->GetLocation().GetTotalRange().GetFrom();
2231         }
2232 
2233         bool is_partial_5prime =
2234             this_loc_mapped->IsPartialStart(eExtreme_Biological);
2235         bool is_partial_3prime =
2236             this_loc_mapped->IsPartialStop(eExtreme_Biological);
2237 
2238         CSeq_loc_CI it1 = loc_it;
2239         bool last_range = !++it1;
2240         if (is_partial_3prime && last_range &&
2241             align.GetSegs().IsSpliced() &&
2242             align.GetSegs().GetSpliced().IsSetPoly_a() &&
2243             feature_on_mrna->GetData().IsCdregion() &&
2244             !this_loc->IsPartialStop(eExtreme_Biological))
2245         {
2246             TSeqPos missing_end =
2247                 this_loc->GetTotalRange().GetTo() -
2248                 equiv->GetEquiv().Get().back()->GetTotalRange().GetTo();
2249             if (missing_end < 3) {
2250                 /// alignment truncates last one or two bases of CDS; stop codon
2251                 /// completed by poly-a tail. This should not be annotated as partial
2252                 is_partial_3prime = false;
2253             }
2254         }
2255 
2256         /// stitch genomic insertions
2257         /// we take the extreme bounds of the interval only;
2258         /// internal details will be recomputed based on intersection
2259         /// with the mRNA location
2260 
2261         ENa_strand strand = this_loc_mapped->GetStrand();
2262         CSeq_loc sub;
2263         sub.SetInt().SetFrom(this_loc_mapped->GetStart(eExtreme_Positional));
2264         sub.SetInt().SetTo(this_loc_mapped->GetStop(eExtreme_Positional));
2265         sub.SetInt().SetStrand(loc->GetStrand());
2266         sub.SetInt().SetId().Assign(*this_loc_mapped->GetId());
2267 
2268         int left = sub.GetInt().GetFrom();
2269         int right = sub.GetInt().GetTo();
2270         bool cross_origin = (left > right);
2271         if (cross_origin) {
2272 
2273             TSeqPos genomic_size = m_scope->GetSequenceLength(*this_loc_mapped->GetId());
2274 
2275             CRef<CSeq_interval> half(new CSeq_interval);
2276             half->Assign(sub.GetInt());
2277             half->SetTo(genomic_size-1);
2278             sub.SetPacked_int().AddInterval(*half);
2279             half->SetFrom(0);
2280             half->SetTo(right);
2281             sub.SetPacked_int().AddInterval(*half);
2282         }
2283 
2284         this_loc_mapped = loc->Intersect(sub,
2285                                          CSeq_loc::fSort,
2286                                          NULL);
2287         this_loc_mapped->SetStrand(strand);
2288 
2289         if (this_loc_mapped->IsMix()) {
2290             /// Propagate any internal fuzzy boundaries on the mRNA to the CDS
2291             set<TSeqPos> mrna_fuzzy_boundaries;
2292             ITERATE (CSeq_loc, subloc_it, *loc) {
2293                 if (subloc_it.GetRangeAsSeq_loc()->
2294                         IsPartialStart(eExtreme_Positional))
2295                 {
2296                     mrna_fuzzy_boundaries.insert(
2297                         subloc_it.GetRange().GetFrom());
2298                 }
2299                 if (subloc_it.GetRangeAsSeq_loc()->
2300                         IsPartialStop(eExtreme_Positional))
2301                 {
2302                     mrna_fuzzy_boundaries.insert(
2303                         subloc_it.GetRange().GetTo());
2304                 }
2305             }
2306 
2307             NON_CONST_ITERATE (CSeq_loc_mix::Tdata, subloc_it,
2308                                this_loc_mapped->SetMix().Set())
2309             {
2310                 (*subloc_it)->SetPartialStart(
2311                      mrna_fuzzy_boundaries.count(
2312                          (*subloc_it)->GetStart(eExtreme_Positional)),
2313                      eExtreme_Positional);
2314                 (*subloc_it)->SetPartialStop(
2315                      mrna_fuzzy_boundaries.count(
2316                          (*subloc_it)->GetStop(eExtreme_Positional)),
2317                      eExtreme_Positional);
2318             }
2319         }
2320 
2321         if (cross_origin) {
2322             this_loc_mapped = FixOrderOfCrossTheOriginSeqloc(*this_loc_mapped,
2323                                                              (left+right)/2);
2324         }
2325         this_loc_mapped->SetPartialStart(is_partial_5prime, eExtreme_Biological);
2326         this_loc_mapped->SetPartialStop(is_partial_3prime, eExtreme_Biological);
2327 
2328         mapped_loc->SetMix().Set().push_back(this_loc_mapped);
2329     }
2330     if (mapped_loc) {
2331         mapped_loc->ChangeToPackedInt();
2332         mapped_loc->SetId(*loc->GetId());
2333     }
2334 
2335     if (loc->GetStart(eExtreme_Positional) > loc->GetStop(eExtreme_Positional)) {
2336         mapped_loc = FixOrderOfCrossTheOriginSeqloc(*mapped_loc,
2337                                                     (loc->GetStart(eExtreme_Positional) + loc->GetStop(eExtreme_Positional))/2);
2338     }
2339 
2340     if (mapped_loc && feature_on_mrna->GetData().IsRna())
2341     {
2342         if (mapped_loc->IsPartialStop(eExtreme_Biological) &&
2343             !feature_on_mrna->GetLocation().IsPartialStop(eExtreme_Biological) &&
2344             align.GetSegs().IsSpliced() &&
2345             align.GetSegs().GetSpliced().CanGetPoly_a())
2346         {
2347             /// When propagaring RNA feature, don't create fuzz at 3' end if
2348             /// alignment has poly-a flag
2349             mapped_loc->SetPartialStop(false, eExtreme_Biological);
2350         }
2351         if ((mapped_loc->IsPartialStart(eExtreme_Biological) &&
2352              !feature_on_mrna->GetLocation().IsPartialStart(eExtreme_Biological)) ||
2353             (mapped_loc->IsPartialStop(eExtreme_Biological) &&
2354              !feature_on_mrna->GetLocation().IsPartialStop(eExtreme_Biological)))
2355         {
2356             CSeq_loc_Mapper reverse_mapper(align, 0, m_scope.GetPointer(), opts);
2357             CSeq_id &mapped_loc_id = const_cast<CSeq_id &>(*mapped_loc->GetId());
2358             TSignedSeqPos feat_start = feature_on_mrna->GetLocation().GetStart(eExtreme_Biological);
2359             CSeq_loc start_loc(mapped_loc_id, mapped_loc->GetStart(eExtreme_Biological));
2360             TSignedSeqPos mapped_start = reverse_mapper.Map(start_loc)->GetStart(eExtreme_Biological);
2361             if (!feature_on_mrna->GetLocation().IsPartialStart(eExtreme_Biological) &&
2362                 TSeqPos(abs(feat_start - mapped_start)) <= m_allowed_unaligned)
2363             {
2364                 /// No fuzz in original, and overhang is within limits; shouldn't have fuzz
2365                 mapped_loc->SetPartialStart(false, eExtreme_Biological);
2366             }
2367             TSignedSeqPos feat_stop = feature_on_mrna->GetLocation().GetStop(eExtreme_Biological);
2368             CSeq_loc stop_loc(mapped_loc_id, mapped_loc->GetStop(eExtreme_Biological));
2369             TSignedSeqPos mapped_stop = reverse_mapper.Map(stop_loc)->GetStop(eExtreme_Biological);
2370             if (!feature_on_mrna->GetLocation().IsPartialStop(eExtreme_Biological) &&
2371                 TSeqPos(abs(feat_stop - mapped_stop)) <= m_allowed_unaligned)
2372             {
2373                 /// No fuzz in original, and overhang is within limits; shouldn't have fuzz
2374                 mapped_loc->SetPartialStop(false, eExtreme_Biological);
2375             }
2376         }
2377     }
2378 
2379     if (mapped_loc && feature_on_mrna->GetData().IsCdregion()) {
2380         /// For CDS features, trim not to begin/end in gaps
2381         /// Trim beginning
2382         CSeqVector vec(*mapped_loc, *m_scope);
2383         TSeqPos start_gap = 0;
2384         for (; vec.IsInGap(start_gap); ++start_gap);
2385         if (start_gap > 0 && start_gap < vec.size()) {
2386             offset += start_gap;
2387 
2388             CSeq_loc orig_mapped_loc;
2389 
2390             bool no_utr = mapped_loc->GetStart(eExtreme_Biological) == loc->GetStart(eExtreme_Biological);
2391             if (no_utr) {
2392                 orig_mapped_loc.Assign(*mapped_loc);
2393             }
2394 
2395             while (mapped_loc->SetPacked_int().Set().front()->GetLength()
2396                         <= start_gap)
2397             {
2398                 start_gap -= mapped_loc->SetPacked_int().Set().front()->GetLength();
2399                 mapped_loc->SetPacked_int().Set().pop_front();
2400             }
2401             if (start_gap) {
2402                 CSeq_interval &first_exon =
2403                     *mapped_loc->SetPacked_int().Set().front();
2404                 if (first_exon.GetStrand() == eNa_strand_minus) {
2405                     first_exon.SetTo() -= start_gap;
2406                 } else {
2407                     first_exon.SetFrom() += start_gap;
2408                 }
2409             }
2410             mapped_loc->SetPartialStart(true, eExtreme_Biological);
2411 
2412             if (no_utr) {
2413                 loc->Assign(*SubtractPreserveBiologicalOrder(*loc, *SubtractPreserveBiologicalOrder(orig_mapped_loc, *mapped_loc)));
2414                 loc->SetPartialStart(true, eExtreme_Biological);
2415             }
2416         }
2417         TSeqPos end_gap = 0;
2418         for (; vec.IsInGap(vec.size() - 1 - end_gap); ++end_gap);
2419         if (end_gap > 0 && end_gap < vec.size()) {
2420             CSeq_loc orig_mapped_loc;
2421 
2422             bool no_utr = mapped_loc->GetStop(eExtreme_Biological) == loc->GetStop(eExtreme_Biological);
2423             if (no_utr) {
2424                 orig_mapped_loc.Assign(*mapped_loc);
2425             }
2426 
2427             while (mapped_loc->SetPacked_int().Set().back()->GetLength() <= end_gap)
2428             {
2429                 end_gap -= mapped_loc->SetPacked_int().Set().back()->GetLength();
2430                 mapped_loc->SetPacked_int().Set().pop_back();
2431             }
2432             if (end_gap) {
2433                 CSeq_interval &last_exon =
2434                     *mapped_loc->SetPacked_int().Set().back();
2435                 if (last_exon.GetStrand() == eNa_strand_minus) {
2436                     last_exon.SetFrom() += end_gap;
2437                 } else {
2438                     last_exon.SetTo() -= end_gap;
2439                 }
2440             }
2441             mapped_loc->SetPartialStop(true, eExtreme_Biological);
2442             if (no_utr) {
2443                 loc->Assign(*SubtractPreserveBiologicalOrder(*loc, *SubtractPreserveBiologicalOrder(orig_mapped_loc, *mapped_loc)));
2444                 loc->SetPartialStop(true, eExtreme_Biological);
2445             }
2446         }
2447     }
2448 
2449     CRef<CSeq_feat> mapped_feat;
2450     if (mapped_loc  && mapped_loc->Which() != CSeq_loc::e_not_set) {
2451         mapped_feat.Reset(new CSeq_feat());
2452         mapped_feat->Assign(*feature_on_mrna);
2453         mapped_feat->ResetId();
2454 
2455         mapped_feat->SetLocation(*mapped_loc);
2456     }
2457     return mapped_feat;
2458 }
2459 
2460 void CFeatureGenerator::
SetPartialFlags(CRef<CSeq_feat> gene_feat,CRef<CSeq_feat> mrna_feat,CRef<CSeq_feat> propagated_feat)2461 SImplementation::SetPartialFlags(CRef<CSeq_feat> gene_feat,
2462                                  CRef<CSeq_feat> mrna_feat,
2463                                  CRef<CSeq_feat> propagated_feat)
2464 {
2465     if(propagated_feat){
2466         for (CSeq_loc_CI loc_it(propagated_feat->GetLocation());  loc_it;  ++loc_it) {
2467             if (loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Biological)  ||  loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Biological)) {
2468                 propagated_feat->SetPartial(true);
2469                 if(gene_feat)
2470                     gene_feat->SetPartial(true);
2471                 break;
2472             }
2473         }
2474     }
2475 
2476     ///
2477     /// partial flags may require a global analysis - we may need to mark some
2478     /// locations partial even if they are not yet partial
2479     ///
2480     if (mrna_feat  &&  propagated_feat)
2481     {
2482         /// in addition to marking the mrna feature partial, we must mark the
2483         /// location partial to match the partialness in the CDS
2484         CSeq_loc& propagated_feat_loc = propagated_feat->SetLocation();
2485         CSeq_loc& mrna_loc = mrna_feat->SetLocation();
2486         if (propagated_feat_loc.IsPartialStart(eExtreme_Biological) &&
2487             propagated_feat_loc.GetStart(eExtreme_Biological) ==
2488             mrna_loc.GetStart(eExtreme_Biological)) {
2489             mrna_loc.SetPartialStart(true, eExtreme_Biological);
2490         }
2491         if (propagated_feat_loc.IsPartialStop(eExtreme_Biological) &&
2492             propagated_feat_loc.GetStop(eExtreme_Biological) ==
2493             mrna_loc.GetStop(eExtreme_Biological)) {
2494             mrna_loc.SetPartialStop(true, eExtreme_Biological);
2495         }
2496     }
2497 
2498     /// set the partial flag for mrna_feat if it has any fuzzy intervals
2499     if(mrna_feat){
2500         for (CSeq_loc_CI loc_it(mrna_feat->GetLocation());  loc_it;  ++loc_it) {
2501             if (loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Biological)  ||  loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Biological)) {
2502                 mrna_feat->SetPartial(true);
2503                 if(gene_feat)
2504                     gene_feat->SetPartial(true);
2505                 break;
2506             }
2507         }
2508     }
2509 
2510     ///
2511     /// set gene partialness if mRNA is partial
2512     if (gene_feat  &&  mrna_feat){
2513         CSeq_loc& mrna_loc = mrna_feat->SetLocation();
2514         CSeq_loc& gene_loc = gene_feat->SetLocation();
2515         if (mrna_loc.IsPartialStart(eExtreme_Biological)) {
2516             gene_loc.SetPartialStart(true, eExtreme_Biological);
2517         }
2518         if (mrna_loc.IsPartialStop(eExtreme_Biological)) {
2519             gene_loc.SetPartialStop(true, eExtreme_Biological);
2520         }
2521     }
2522 
2523     ///
2524     /// set gene partialness if CDS is partial
2525     if (gene_feat  &&  propagated_feat && !mrna_feat){
2526         CSeq_loc& propagated_loc = propagated_feat->SetLocation();
2527         CSeq_loc& gene_loc = gene_feat->SetLocation();
2528         if (propagated_loc.IsPartialStart(eExtreme_Biological)) {
2529             gene_loc.SetPartialStart(true, eExtreme_Biological);
2530         }
2531         if (propagated_loc.IsPartialStop(eExtreme_Biological)) {
2532             gene_loc.SetPartialStop(true, eExtreme_Biological);
2533         }
2534     }
2535 }
2536 
2537 /// Check whether range1 contains range2
s_Contains(const TSeqRange & range1,const TSeqRange & range2)2538 static inline bool s_Contains(const TSeqRange &range1, const TSeqRange &range2)
2539 {
2540     return range1.GetFrom() <= range2.GetFrom() &&
2541            range1.GetTo() >= range2.GetTo();
2542 }
2543 
2544 void CFeatureGenerator::
RecomputePartialFlags(CSeq_annot & annot)2545 SImplementation::RecomputePartialFlags(CSeq_annot& annot)
2546 {
2547     CScope scope(*CObjectManager::GetInstance());
2548     CSeq_annot_Handle sah = scope.AddSeq_annot(annot);
2549 
2550     /// We're going to recalculate Partial flags for all features,
2551     /// and fuzzy ends for gene features; reset them if they're currently set
2552     for(CFeat_CI ci(sah); ci; ++ci){
2553         CSeq_feat_EditHandle handle(*ci);
2554         CRef<CSeq_feat> feat(const_cast<CSeq_feat*>(handle.GetSeq_feat().GetPointer()));
2555         feat->ResetPartial();
2556         if(feat->GetData().IsGene()){
2557             feat->SetLocation().SetPartialStart(false, eExtreme_Biological);
2558             feat->SetLocation().SetPartialStop(false, eExtreme_Biological);
2559         }
2560     }
2561 
2562     feature::CFeatTree tree(sah);
2563     vector<CMappedFeat> top_level_features = tree.GetChildren(CMappedFeat());
2564 
2565     /// Sort top features (i.e. Seq_feat objects with no parent) by type
2566     vector< vector<CMappedFeat> > top_level_features_by_type;
2567     top_level_features_by_type.resize(CSeqFeatData::e_MaxChoice);
2568 
2569     ITERATE(vector<CMappedFeat>, it, top_level_features)
2570         top_level_features_by_type[it->GetData().Which()].push_back(*it);
2571 
2572     /// Add null gene and rna features; this makes the programming easier for
2573     /// dealing with top-level rnas and top-level cd regions
2574     top_level_features_by_type[CSeqFeatData::e_Gene].push_back(CMappedFeat());
2575     top_level_features_by_type[CSeqFeatData::e_Rna].push_back(CMappedFeat());
2576 
2577     ITERATE(vector<CMappedFeat>, gene_it,
2578             top_level_features_by_type[CSeqFeatData::e_Gene])
2579     {
2580         CRef<CSeq_feat> gene_feat;
2581         if(*gene_it){
2582             CSeq_feat_EditHandle gene_handle(*gene_it);
2583             gene_feat.Reset(const_cast<CSeq_feat*>(gene_handle.GetSeq_feat().GetPointer()));
2584         }
2585         // Get gene's children; or, if we've reached the sentinel null gene
2586         // feature, get top-level rnas.
2587         vector<CMappedFeat> gene_children =
2588                 gene_feat ? tree.GetChildren(*gene_it)
2589                           : top_level_features_by_type[CSeqFeatData::e_Rna];
2590         sort(gene_children.begin(), gene_children.end());
2591 
2592         ITERATE(vector<CMappedFeat>, child_it, gene_children){
2593             CRef<CSeq_feat> child_feat;
2594             if(*child_it){
2595                 CSeq_feat_EditHandle child_handle(*child_it);
2596                 child_feat.Reset(const_cast<CSeq_feat*>(child_handle.GetSeq_feat().GetPointer()));
2597             }
2598             if(child_feat && child_feat->GetData().IsCdregion()){
2599                 // We have gene and cds with no RNA feature
2600                 SetPartialFlags(gene_feat, CRef<CSeq_feat>(), child_feat);
2601             } else if(!child_feat || child_feat->GetData().IsRna()){
2602                 vector<CMappedFeat> rna_children =
2603                     child_feat ? tree.GetChildren(*child_it)
2604                                : top_level_features_by_type[CSeqFeatData::e_Cdregion];
2605                 /// When propagating a ncRNA feature, the propagated feature will have a range
2606                 /// contained within the range of the newly-created RNA feature, and should be
2607                 /// treated as its child. Unfortunately the logic of CFeatTree does not recognize
2608                 /// one RNA feature as a child of the other, so we need to do that manually.
2609                 while((child_it+1) != gene_children.end() &&
2610                       (child_it+1)->GetData().GetSubtype() == CSeqFeatData::eSubtype_ncRNA &&
2611                       s_Contains(child_feat->GetLocation().GetTotalRange(),
2612                                  (child_it+1)->GetTotalRange())){
2613                     rna_children.push_back(*(++child_it));
2614                 }
2615                 if(rna_children.empty()){
2616                     // We have gene and RNA with no cds feature
2617                     SetPartialFlags(gene_feat, child_feat, CRef<CSeq_feat>());
2618                 } else {
2619                     ITERATE(vector<CMappedFeat>, rna_child_it, rna_children){
2620                         CRef<CSeq_feat> rna_child_feat;
2621                         CSeq_feat_EditHandle rna_child_handle(*rna_child_it);
2622                         rna_child_feat.Reset(const_cast<CSeq_feat*>(rna_child_handle.GetSeq_feat().GetPointer()));
2623                         SetPartialFlags(gene_feat, child_feat, rna_child_feat);
2624                     }
2625                 }
2626             }
2627         }
2628     }
2629 }
2630 
2631 
x_CheckInconsistentDbxrefs(CConstRef<CSeq_feat> gene_feat,CConstRef<CSeq_feat> propagated_feature)2632 void CFeatureGenerator::SImplementation::x_CheckInconsistentDbxrefs(
2633                        CConstRef<CSeq_feat> gene_feat,
2634                        CConstRef<CSeq_feat> propagated_feature)
2635 {
2636     if(!gene_feat || !gene_feat->IsSetDbxref() ||
2637        !propagated_feature || !propagated_feature->IsSetDbxref())
2638         return;
2639 
2640     ITERATE (CSeq_feat::TDbxref, gene_xref_it, gene_feat->GetDbxref())
2641     /// Special case for miRBase; the gene feature and propagated ncRNA features can
2642     /// legitimately have different tags for it
2643     if((*gene_xref_it)->GetDb() != "miRBase")
2644         ITERATE (CSeq_feat::TDbxref, propagated_xref_it, propagated_feature->GetDbxref())
2645             if((*gene_xref_it)->GetDb() == (*propagated_xref_it)->GetDb() &&
2646                !(*gene_xref_it)->Match(**propagated_xref_it))
2647             {
2648                 string propagated_feature_desc;
2649                 if(propagated_feature->GetData().IsCdregion())
2650                     propagated_feature_desc = "corresponding cdregion";
2651                 else {
2652                     NCBI_ASSERT(propagated_feature->GetData().GetSubtype() == CSeqFeatData::eSubtype_ncRNA,
2653                                 "Unexpected propagated feature type");
2654                     propagated_feature_desc = "propagated ncRNA feature";
2655                 }
2656                 if(propagated_feature->CanGetProduct())
2657                     propagated_feature_desc += " " + propagated_feature->GetProduct().GetId()->AsFastaString();
2658                 ERR_POST(Warning << "Features for gene "
2659                                  << gene_feat->GetLocation().GetId()->AsFastaString()
2660                                  << " and " << propagated_feature_desc
2661                                  << " have " << (*gene_xref_it)->GetDb()
2662                                  << " dbxrefs with inconsistent tags");
2663             }
2664 }
2665 
2666 
2667 
x_CopyAdditionalFeatures(const CBioseq_Handle & handle,SMapper & mapper,CSeq_annot & annot)2668 void CFeatureGenerator::SImplementation::x_CopyAdditionalFeatures(const CBioseq_Handle& handle, SMapper& mapper, CSeq_annot& annot)
2669 {
2670     if (m_flags & fPromoteAllFeatures) {
2671         SAnnotSelector sel;
2672         sel.SetResolveAll()
2673             .SetAdaptiveDepth(true)
2674             .ExcludeFeatSubtype(CSeqFeatData::eSubtype_gene)
2675             .ExcludeFeatSubtype(CSeqFeatData::eSubtype_mRNA)
2676             .ExcludeFeatSubtype(CSeqFeatData::eSubtype_cdregion);
2677         for (CFeat_CI feat_iter(handle, sel);  feat_iter;  ++feat_iter) {
2678             CRef<CSeq_feat> feat(new CSeq_feat());
2679             feat->Assign(feat_iter->GetOriginalFeature());
2680             CRef<CSeq_loc> new_loc =
2681                 mapper.Map(feat_iter->GetLocation());
2682             feat->SetLocation(*new_loc);
2683 
2684             _ASSERT(feat->GetData().Which() != CSeqFeatData::e_not_set);
2685 
2686             annot.SetData().SetFtable().push_back(feat);
2687         }
2688     }
2689 }
2690 
2691 
2692 
2693 //////////////////////////////////////////////////////////////////////////////
2694 
2695 
2696 ///
2697 /// Handle feature exceptions
2698 ///
x_HandleRnaExceptions(CSeq_feat & feat,const CSeq_align * align,CSeq_feat * cds_feat,const CSeq_feat * cds_feat_on_mrna)2699 void CFeatureGenerator::SImplementation::x_HandleRnaExceptions(CSeq_feat& feat,
2700                                   const CSeq_align* align,
2701                                   CSeq_feat* cds_feat,
2702                                   const CSeq_feat* cds_feat_on_mrna)
2703 {
2704     if ( !feat.IsSetProduct() ) {
2705         ///
2706         /// corner case:
2707         /// we may be a CDS feature for an Ig locus
2708         /// check to see if we have an overlapping V/C/D/J/S region
2709         /// we trust only featu-id xrefs here
2710         ///
2711         if (feat.IsSetXref()) {
2712             CBioseq_Handle bsh = m_scope->GetBioseqHandle(*feat.GetLocation().GetId());
2713             const CTSE_Handle& tse = bsh.GetTSE_Handle();
2714 
2715             ITERATE (CSeq_feat::TXref, it, feat.GetXref()) {
2716                 if ( !(*it)->IsSetId() ) {
2717                     continue;
2718                 }
2719 
2720                 CSeq_feat_Handle h;
2721                 const CFeat_id& feat_id = (*it)->GetId();
2722                 if (feat_id.IsLocal()) {
2723                     if (feat_id.GetLocal().IsId()) {
2724                         h = tse.GetFeatureWithId(CSeqFeatData::e_not_set,
2725                                                  feat_id.GetLocal().GetId());
2726                     } else {
2727                         h = tse.GetFeatureWithId(CSeqFeatData::e_not_set,
2728                                                  feat_id.GetLocal().GetStr());
2729                     }
2730                 }
2731 
2732                 if (h) {
2733                     switch (h.GetData().GetSubtype()) {
2734                     case CSeqFeatData::eSubtype_C_region:
2735                     case CSeqFeatData::eSubtype_D_loop:
2736                     case CSeqFeatData::eSubtype_D_segment:
2737                     case CSeqFeatData::eSubtype_J_segment:
2738                     case CSeqFeatData::eSubtype_S_region:
2739                     case CSeqFeatData::eSubtype_V_region:
2740                     case CSeqFeatData::eSubtype_V_segment:
2741                         /// found it
2742                         feat.SetExcept(true);
2743                         feat.SetExcept_text
2744                             ("rearrangement required for product");
2745                         break;
2746 
2747                     default:
2748                         break;
2749                     }
2750                 }
2751             }
2752         }
2753         return;
2754     }
2755 
2756     ///
2757     /// check to see if there is a Spliced-seg alignment
2758     /// if there is, and it corresponds to this feature, we should use it to
2759     /// record our exceptions
2760     ///
2761 
2762     CConstRef<CSeq_align> al;
2763     if (align  &&  align->GetSegs().IsSpliced()) {
2764         al.Reset(align);
2765     }
2766     if ( !al ) {
2767         SAnnotSelector sel;
2768         sel.SetResolveAll();
2769         CAlign_CI align_iter(*m_scope, feat.GetLocation(), sel);
2770         for ( ;  align_iter;  ++align_iter) {
2771             const CSeq_align& this_align = *align_iter;
2772             if (this_align.GetSegs().IsSpliced()  &&
2773                 sequence::IsSameBioseq
2774                 (sequence::GetId(feat.GetProduct(), m_scope.GetPointer()),
2775                  this_align.GetSeq_id(0),
2776                  m_scope.GetPointer())) {
2777                 al.Reset(&this_align);
2778                 break;
2779             }
2780         }
2781     }
2782 
2783     bool has_length_mismatch = false;
2784     //bool has_polya_tail = false;
2785     bool has_incomplete_polya_tail = false;
2786     bool partial_unaligned_section = false;
2787     CRangeCollection<TSeqPos> mismatch_locs;
2788     CRangeCollection<TSeqPos> insert_locs;
2789     CRangeCollection<TSeqPos> delete_locs;
2790     map<TSeqPos,TSeqPos> delete_sizes;
2791 
2792     CBioseq_Handle prod_bsh    = m_scope->GetBioseqHandle(*feat.GetProduct().GetId());
2793     if ( !prod_bsh ) {
2794         /// Product doesn't exist (will happen for fake transcript when handling
2795         /// protein alignments); no basis for creating exceptions
2796         return;
2797     }
2798 
2799     TSeqPos loc_len = sequence::GetLength(feat.GetLocation(), m_scope.GetPointer());
2800     if (loc_len > prod_bsh.GetBioseqLength()) {
2801         has_length_mismatch = true;
2802     }
2803 
2804     if (al && al->GetSegs().GetSpliced().GetProduct_type()==CSpliced_seg::eProduct_type_transcript) {
2805         ///
2806         /// can do full comparison
2807         ///
2808 
2809         /// we know we have a Spliced-seg
2810         /// evaluate for gaps or mismatches
2811         TSeqPos prev_to = 0;
2812         ITERATE (CSpliced_seg::TExons, exon_it,
2813                  al->GetSegs().GetSpliced().GetExons()) {
2814             const CSpliced_exon& exon = **exon_it;
2815             TSeqPos pos = exon.GetProduct_start().GetNucpos();
2816             if (exon_it != al->GetSegs().GetSpliced().GetExons().begin()) {
2817                 TSeqRange gap(prev_to+1, pos-1);
2818                 if (gap.NotEmpty()) {
2819                     if (feat.IsSetPartial()) {
2820                         partial_unaligned_section = true;
2821                     } else {
2822                         insert_locs += gap;
2823                     }
2824                 }
2825             }
2826             prev_to = exon.GetProduct_end().GetNucpos();
2827             if (exon.IsSetParts()) {
2828                 ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
2829                     switch ((*part_it)->Which()) {
2830                     case CSpliced_exon_chunk::e_Match:
2831                         pos += (*part_it)->GetMatch();
2832                         break;
2833                     case CSpliced_exon_chunk::e_Mismatch:
2834                         mismatch_locs +=
2835                             TSeqRange(pos, pos+(*part_it)->GetMismatch()-1);
2836                         pos += (*part_it)->GetMismatch();
2837                         break;
2838                     case CSpliced_exon_chunk::e_Diag:
2839                         pos += (*part_it)->GetDiag();
2840                         break;
2841                     case CSpliced_exon_chunk::e_Genomic_ins:
2842                         delete_locs += TSeqRange(pos, pos);
2843                         delete_sizes[pos] = (*part_it)->GetGenomic_ins();
2844                         break;
2845                     case CSpliced_exon_chunk::e_Product_ins:
2846                         insert_locs +=
2847                             TSeqRange(pos, pos+(*part_it)->GetProduct_ins()-1);
2848                         pos += (*part_it)->GetProduct_ins();
2849                         break;
2850                     default:
2851                         break;
2852                     }
2853                 }
2854             }
2855         }
2856 
2857         /// Check against aligned range - see if there is a 5' or 3'
2858         /// discrepancy
2859         TSeqRange r = al->GetSeqRange(0);
2860         if (r.GetFrom() != 0) {
2861             if (feat.IsSetPartial()) {
2862                 partial_unaligned_section = true;
2863             } else {
2864                 insert_locs += TSeqRange(0, r.GetFrom()-1);
2865             }
2866         }
2867 
2868         TSeqPos max_align_len = 0;
2869         if (al->GetSegs().GetSpliced().IsSetPoly_a()) {
2870             //has_polya_tail = true;
2871             max_align_len = al->GetSegs().GetSpliced().GetPoly_a();
2872         } else if (al->GetSegs().GetSpliced().IsSetProduct_length()) {
2873             max_align_len = al->GetSegs().GetSpliced().GetProduct_length();
2874         } else {
2875             max_align_len = prod_bsh.GetBioseqLength();
2876         }
2877 
2878         if (r.GetTo() + 1 < max_align_len) {
2879             if (feat.IsSetPartial()) {
2880                 partial_unaligned_section = true;
2881             } else {
2882                 insert_locs += TSeqRange(r.GetTo()+1, max_align_len-1);
2883             }
2884         }
2885 
2886         /// also note the poly-A
2887         /**
2888         if (al->GetSegs().GetSpliced().IsSetPoly_a()) {
2889             has_polya_tail = true;
2890         }
2891         **/
2892     }
2893 
2894     if ( insert_locs.empty() && delete_locs.empty() && !partial_unaligned_section)
2895     {
2896         /// only compare for mismatches and 3' unaligned
2897         /// we assume that the feature is otherwise aligned
2898 
2899         CSeqVector nuc_vec(feat.GetLocation(), *m_scope,
2900                            CBioseq_Handle::eCoding_Iupac);
2901 
2902         CSeqVector rna_vec(prod_bsh,
2903                            CBioseq_Handle::eCoding_Iupac);
2904 
2905         CSeqVector::const_iterator prod_it  = rna_vec.begin();
2906         CSeqVector::const_iterator prod_end = rna_vec.end();
2907 
2908         CSeqVector::const_iterator genomic_it  = nuc_vec.begin();
2909         CSeqVector::const_iterator genomic_end = nuc_vec.end();
2910         mismatch_locs.clear();
2911 
2912         for ( ;  prod_it != prod_end  &&  genomic_it != genomic_end;
2913               ++prod_it, ++genomic_it) {
2914             if (*prod_it != *genomic_it) {
2915                 mismatch_locs += TSeqRange(prod_it.GetPos(), prod_it.GetPos());
2916             }
2917         }
2918 
2919         size_t tail_len = prod_end - prod_it;
2920         size_t count_a = 0;
2921         for ( ;  prod_it != prod_end;  ++prod_it) {
2922             if (*prod_it == 'A') {
2923                 ++count_a;
2924             }
2925         }
2926 
2927         if (tail_len  &&  count_a >= tail_len * 0.8) {
2928             //has_polya_tail = true;
2929             if (count_a < tail_len * 0.95) {
2930                 has_incomplete_polya_tail = true;
2931             }
2932         }
2933         else if (tail_len) {
2934             if (feat.IsSetPartial()) {
2935                 partial_unaligned_section = true;
2936             } else {
2937                 TSeqPos end_pos = feat.GetLocation().GetTotalRange().GetTo();
2938                 insert_locs += TSeqRange(end_pos-tail_len+1, end_pos);
2939             }
2940         }
2941     }
2942 
2943     string except_text;
2944     if (!insert_locs.empty() ||
2945         !delete_locs.empty() ||
2946         has_length_mismatch  ||
2947         has_incomplete_polya_tail ||
2948         partial_unaligned_section) {
2949         except_text = "unclassified transcription discrepancy";
2950     }
2951     else if (!mismatch_locs.empty()) {
2952         except_text = "mismatches in transcription";
2953     }
2954 
2955     x_SetExceptText(feat, except_text);
2956     x_SetComment(feat, cds_feat, cds_feat_on_mrna, align, mismatch_locs,
2957                  insert_locs, delete_locs, delete_sizes,
2958                  partial_unaligned_section);
2959 }
2960 
s_MapSingleAA(TSeqPos pos,CRef<CSeq_id> mapped_protein_id,const CRangeCollection<TSeqPos> & product_ranges,CRef<CSeq_loc_Mapper> to_mrna,CRef<CSeq_loc_Mapper> to_genomic)2961 static CRef<CSeq_loc> s_MapSingleAA(
2962     TSeqPos pos, CRef<CSeq_id> mapped_protein_id,
2963     const CRangeCollection<TSeqPos> &product_ranges,
2964     CRef<CSeq_loc_Mapper> to_mrna, CRef<CSeq_loc_Mapper> to_genomic)
2965 {
2966     CRef<CSeq_loc> mapped;
2967     if (to_mrna) {
2968         ITERATE (CRangeCollection<TSeqPos>, range_it, product_ranges) {
2969             if (range_it->GetLength() > pos) {
2970                 pos += range_it->GetFrom();
2971                 break;
2972             } else {
2973                 pos -= range_it->GetLength();
2974             }
2975         }
2976         CSeq_loc base_loc(*mapped_protein_id, pos, pos);
2977         CRef<CSeq_loc> mrna_loc = to_mrna->Map(base_loc);
2978         mapped = to_genomic->Map(*mrna_loc);
2979         mapped->SetPartialStart(false, eExtreme_Biological);
2980         mapped->SetPartialStop(false, eExtreme_Biological);
2981     }
2982     return mapped;
2983 }
2984 
2985 
x_HandleCdsExceptions(CSeq_feat & feat,const CSeq_align * align,const CSeq_feat * cds_feat_on_query_mrna,const CSeq_feat * cds_feat_on_transcribed_mrna,list<CRef<CSeq_loc>> * transcribed_mrna_seqloc_refs,TSeqPos * clean_match_count)2986 void CFeatureGenerator::SImplementation::x_HandleCdsExceptions(CSeq_feat& feat,
2987                                   const CSeq_align* align,
2988                                   const CSeq_feat* cds_feat_on_query_mrna,
2989 				  const CSeq_feat* cds_feat_on_transcribed_mrna,
2990                                   list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs,
2991                                   TSeqPos *clean_match_count)
2992 {
2993     if ( !feat.IsSetProduct()
2994          || ( cds_feat_on_query_mrna && !cds_feat_on_query_mrna->IsSetProduct() )
2995          ) {
2996         ///
2997         /// corner case:
2998         /// we may be a CDS feature for an Ig locus
2999         /// check to see if we have an overlapping V/C/D/J/S region
3000         /// we trust only featu-id xrefs here
3001         ///
3002         if (feat.IsSetXref()) {
3003             CBioseq_Handle bsh = m_scope->GetBioseqHandle(*feat.GetLocation().GetId());
3004             const CTSE_Handle& tse = bsh.GetTSE_Handle();
3005 
3006             ITERATE (CSeq_feat::TXref, it, feat.GetXref()) {
3007                 if ( !(*it)->IsSetId() ) {
3008                     continue;
3009                 }
3010 
3011                 CSeq_feat_Handle h;
3012                 const CFeat_id& feat_id = (*it)->GetId();
3013                 if (feat_id.IsLocal()) {
3014                     if (feat_id.GetLocal().IsId()) {
3015                         h = tse.GetFeatureWithId(CSeqFeatData::e_not_set,
3016                                                  feat_id.GetLocal().GetId());
3017                     } else {
3018                         h = tse.GetFeatureWithId(CSeqFeatData::e_not_set,
3019                                                  feat_id.GetLocal().GetStr());
3020                     }
3021                 }
3022 
3023                 if (h) {
3024                     switch (h.GetData().GetSubtype()) {
3025                     case CSeqFeatData::eSubtype_C_region:
3026                     case CSeqFeatData::eSubtype_D_loop:
3027                     case CSeqFeatData::eSubtype_D_segment:
3028                     case CSeqFeatData::eSubtype_J_segment:
3029                     case CSeqFeatData::eSubtype_S_region:
3030                     case CSeqFeatData::eSubtype_V_region:
3031                     case CSeqFeatData::eSubtype_V_segment:
3032                         /// found it
3033                         feat.SetExcept(true);
3034                         feat.SetExcept_text
3035                             ("rearrangement required for product");
3036                         break;
3037 
3038                     default:
3039                         break;
3040                     }
3041                 }
3042             }
3043         }
3044         return;
3045     }
3046 
3047     ///
3048     /// exceptions here are easy:
3049     /// we compare the annotated product to the conceptual translation and
3050     /// report problems
3051     ///
3052     bool has_start         = false;
3053     bool has_stop          = false;
3054     TSeqPos mismatch_count = 0;
3055     bool has_gap           = false;
3056     bool has_indel         = false;
3057 
3058     string xlate;
3059 
3060     CRef<CSeq_feat> corrected_cds_feat_on_query_mrna;
3061     CRef<CSeq_feat> corrected_cds_feat_on_transcribed_mrna;
3062     if (cds_feat_on_query_mrna) {
3063         /// In some cases, the id in the CDS feature is not the same as in the
3064         /// alignment; make sure the mapping is done with matching ids
3065 
3066         corrected_cds_feat_on_query_mrna.Reset(new CSeq_feat);
3067         corrected_cds_feat_on_query_mrna->Assign(*cds_feat_on_query_mrna);
3068         corrected_cds_feat_on_query_mrna->SetLocation().SetId(align->GetSeq_id(0));
3069 
3070         corrected_cds_feat_on_transcribed_mrna.Reset(new CSeq_feat);
3071         corrected_cds_feat_on_transcribed_mrna->Assign(*cds_feat_on_transcribed_mrna);
3072         corrected_cds_feat_on_transcribed_mrna->SetLocation().SetId(align->GetSeq_id(0));
3073     }
3074 
3075     int cds_start_on_mrna = 0;
3076     int frame_on_mrna = 0;
3077     bool filled_by_polya = false;
3078 
3079     if (align != NULL) {
3080         CBioseq bioseq;
3081         x_CollectMrnaSequence(bioseq.SetInst(), *align, feat.GetLocation(), true, true, &has_gap, &has_indel);
3082 
3083         CSeqVector seq(bioseq, m_scope.GetPointer(),
3084                        CBioseq_Handle::eCoding_Iupac);
3085 
3086         if (feat.GetProduct().GetId()->IsLocal()) {
3087             /// For locally generated product, product is generated from
3088             /// translation so we know it will fit the translation perfectly;
3089             /// only need exceptions if there are frameshifts
3090             if (has_indel) {
3091                 string except_text = "unclassified translation discrepancy";
3092                 x_SetExceptText(feat, except_text);
3093             }
3094             if (clean_match_count) {
3095                 *clean_match_count = seq.size();
3096             }
3097             return;
3098         }
3099 
3100         CRef<CSeq_loc_Mapper> genomic_to_mrna(
3101             new CSeq_loc_Mapper(*align, CSeq_loc_Mapper::eSplicedRow_Prod));
3102 
3103         int cds_len_on_query_mrna = GetLength(feat.GetLocation(), NULL);
3104         int missing_end = 0;
3105         if (cds_feat_on_query_mrna) {
3106             cds_start_on_mrna =
3107                 cds_feat_on_query_mrna->GetLocation().GetStart(eExtreme_Positional);
3108             cds_len_on_query_mrna = GetLength(cds_feat_on_query_mrna->GetLocation(), NULL);
3109             CRef<CSeq_loc> aligned_cds = genomic_to_mrna->Map(feat.GetLocation());
3110             missing_end = cds_feat_on_query_mrna->GetLocation().GetStop(eExtreme_Positional)
3111                         - aligned_cds->GetStop(eExtreme_Positional);
3112 
3113             if (cds_feat_on_query_mrna->GetData().GetCdregion().IsSetFrame()) {
3114                 switch (cds_feat_on_query_mrna->GetData().GetCdregion().GetFrame()) {
3115                 case CCdregion::eFrame_two :
3116                     frame_on_mrna = 1;
3117                     break;
3118                 case CCdregion::eFrame_three :
3119                     frame_on_mrna = 2;
3120                     break;
3121                 default :
3122                     break;
3123                 }
3124             }
3125         }
3126         string mrna;
3127         seq.GetSeqData(cds_start_on_mrna + frame_on_mrna, cds_start_on_mrna + cds_len_on_query_mrna, mrna);
3128         if ((missing_end == 1 || missing_end == 2) &&
3129             !feat.GetLocation().IsPartialStop(eExtreme_Biological) &&
3130             align->GetSegs().IsSpliced() &&
3131             align->GetSegs().GetSpliced().CanGetPoly_a())
3132         {
3133             /// One or two bases at end replaced by poly-a
3134             filled_by_polya = true;
3135             for (size_t pos = mrna.size() - 1 - missing_end;
3136                  pos < mrna.size(); ++pos)
3137             {
3138                 mrna[pos] = 'A';
3139             }
3140         }
3141 
3142         const CGenetic_code *code = NULL;
3143         if (feat.GetData().GetCdregion().IsSetCode()) {
3144             code = &feat.GetData().GetCdregion().GetCode();
3145         }
3146         bool partial_start = feat.GetLocation().IsPartialStart(eExtreme_Biological);
3147         CSeqTranslator::Translate(mrna, xlate,
3148                                   partial_start
3149                                   ? CSeqTranslator::fIs5PrimePartial
3150                                   : CSeqTranslator::fDefault,
3151                                   code);
3152         if (xlate.size() && xlate[0] == '-') {
3153             /// First codon couldn't be translated as initial codon; translate
3154             /// as mid-sequence codon instead
3155             string first_codon = mrna.substr(0,3);
3156             string first_aa;
3157             CSeqTranslator::Translate(first_codon, first_aa,
3158                                       CSeqTranslator::fIs5PrimePartial, code);
3159             xlate[0] = first_aa[0];
3160         }
3161 
3162         // deal with code breaks
3163         // NB: these should be folded into the translation machinery instead...
3164         if (feat.GetData().GetCdregion().IsSetCode_break() && corrected_cds_feat_on_transcribed_mrna)
3165         {
3166             ITERATE (CCdregion::TCode_break, it,
3167                      feat.GetData().GetCdregion().GetCode_break()) {
3168 
3169                 const CSeq_loc& cb_on_genome = (*it)->GetLoc();
3170                 CRef<CSeq_loc> cb_on_mrna = genomic_to_mrna->Map(cb_on_genome);
3171                 if (!cb_on_mrna) continue;
3172 
3173                 TSeqRange r = cb_on_mrna->GetTotalRange();
3174                 if (r.GetLength() != 3) {
3175                     continue;
3176                 }
3177 
3178                 int pos = (cb_on_mrna->GetStart(eExtreme_Biological)-(cds_start_on_mrna+frame_on_mrna))/3;
3179 
3180                 string src;
3181                 CSeqUtil::ECoding src_coding = CSeqUtil::e_Ncbieaa;
3182 
3183                 switch ((*it)->GetAa().Which()) {
3184                 case CCode_break::TAa::e_Ncbieaa:
3185                     src += (char)(*it)->GetAa().GetNcbieaa();
3186                     src_coding = CSeqUtil::e_Ncbieaa;
3187                     break;
3188 
3189                 case CCode_break::TAa::e_Ncbistdaa:
3190                     src += (char)(*it)->GetAa().GetNcbistdaa();
3191                     src_coding = CSeqUtil::e_Ncbistdaa;
3192                     break;
3193 
3194                 case CCode_break::TAa::e_Ncbi8aa:
3195                     src += (char)(*it)->GetAa().GetNcbi8aa();
3196                     src_coding = CSeqUtil::e_Ncbi8aa;
3197                     break;
3198 
3199                 default:
3200                     break;
3201                 }
3202 
3203                 if (src.size()) {
3204                     string dst;
3205                     CSeqConvert::Convert(src, src_coding, 0, 1,
3206                                          dst, CSeqUtil::e_Ncbieaa);
3207                     xlate[pos] = dst[0];
3208                 }
3209             }
3210         }
3211     } else {
3212         CSeqTranslator::Translate(feat, *m_scope, xlate);
3213     }
3214 
3215     CRef<CSeq_loc_Mapper> to_prot;
3216     CRef<CSeq_loc_Mapper> to_mrna;
3217     CRef<CSeq_loc_Mapper> to_genomic;
3218     CRef<CSeq_id> mapped_protein_id;
3219     if (corrected_cds_feat_on_transcribed_mrna) {
3220         to_prot.Reset(
3221             new CSeq_loc_Mapper(*corrected_cds_feat_on_query_mrna,
3222                                 CSeq_loc_Mapper::eLocationToProduct));
3223         to_mrna.Reset(
3224             new CSeq_loc_Mapper(*corrected_cds_feat_on_query_mrna,
3225                                 CSeq_loc_Mapper::eProductToLocation));
3226         to_genomic.Reset(
3227             new CSeq_loc_Mapper(*align, CSeq_loc_Mapper::eSplicedRow_Gen));
3228         mapped_protein_id.Reset(new CSeq_id);
3229         mapped_protein_id->Assign(*corrected_cds_feat_on_query_mrna->GetProduct().GetId());
3230     }
3231 
3232     CRef<CSeq_id> cds_id(new CSeq_id);
3233     cds_id->Assign(*feat.GetProduct().GetId());
3234 
3235     string actual;
3236     CRef<CSeq_loc> whole_product(new CSeq_loc);
3237     whole_product->SetWhole(*cds_id);
3238     CSeqVector vec(*whole_product, *m_scope,
3239                    CBioseq_Handle::eCoding_Iupac);
3240     CRangeCollection<TSeqPos> product_ranges(TSeqRange::GetWhole());
3241     if (cds_feat_on_transcribed_mrna) {
3242         /// make sure we're comparing to aligned part of product
3243 
3244         CSeq_loc cds_feat_on_transcribed_mrna_loc;
3245         cds_feat_on_transcribed_mrna_loc.Assign(corrected_cds_feat_on_transcribed_mrna->GetLocation());
3246         if (cds_feat_on_transcribed_mrna_loc.GetStrand() == eNa_strand_minus) {
3247             cds_feat_on_transcribed_mrna_loc.FlipStrand();
3248         }
3249 
3250         CRef<CSeq_loc> aligned_range =
3251             align->CreateRowSeq_loc(0)->Intersect(cds_feat_on_transcribed_mrna_loc, 0, NULL);
3252         CRef<CSeq_loc> product_loc = to_prot->Map(*aligned_range);
3253         product_ranges.clear();
3254         ITERATE (CSeq_loc, loc_it, *product_loc) {
3255             product_ranges += loc_it.GetRange();
3256         }
3257 
3258 
3259         if (!corrected_cds_feat_on_transcribed_mrna->GetLocation().IsPartialStop(eExtreme_Biological) &&
3260             aligned_range->Intersect(corrected_cds_feat_on_transcribed_mrna->GetLocation(), 0, NULL)->GetStop(eExtreme_Biological) ==
3261             corrected_cds_feat_on_transcribed_mrna->GetLocation().GetStop(eExtreme_Biological)) {
3262             // trim off the stop
3263             product_ranges -= TSeqRange(product_ranges.GetTo(),
3264                                         product_ranges.GetTo());
3265         }
3266     }
3267 
3268     if ((xlate.size() == product_ranges.GetTo() + (filled_by_polya ? 1 : 2)  ||
3269          product_ranges.GetTo() == TSeqRange::GetWholeTo()) &&
3270         xlate[xlate.size() - 1] == '*')
3271     { /// strip a terminal stop
3272         xlate.resize(xlate.size() - 1);
3273         has_stop = true;
3274     }
3275     else if (feat.GetLocation().IsPartialStop(eExtreme_Biological)) {
3276         has_stop = true;
3277     } else {
3278         has_stop = false;
3279     }
3280 
3281     if ( (product_ranges.GetFrom()==0 && xlate.size()  &&  xlate[0] == 'M')  ||
3282          feat.GetLocation().IsPartialStart(eExtreme_Biological) ) {
3283         has_start = true;
3284     }
3285 
3286     if (product_ranges.Empty()) {
3287         has_gap = true;
3288     } else {
3289         string whole;
3290 	vec.GetSeqData(0, vec.size(), whole);
3291         if (product_ranges[0].IsWhole()) {
3292             actual = whole;
3293         } else {
3294 	    string xlate_trimmed;
3295             ITERATE (CRangeCollection<TSeqPos>, range_it, product_ranges) {
3296                 actual += whole.substr(range_it->GetFrom(), range_it->GetLength());
3297                 xlate_trimmed += xlate.substr(range_it->GetFrom(), range_it->GetLength());
3298             }
3299             xlate = xlate_trimmed;
3300         }
3301         if (actual != whole) {
3302             has_gap = true;
3303         }
3304     }
3305 
3306     ///
3307     /// now, compare the two
3308     /// we deliberately look for problems here rather than using a direct
3309     /// string compare
3310     /// NB: we could actually compare lengths first; a length difference imples
3311     /// the unclassified translation discrepancy state, but we may expand these
3312     /// states in the future, so it's better to be explicit about our data
3313     /// recording first
3314     ///
3315 
3316     string::const_iterator it1     = actual.begin();
3317     string::const_iterator it1_end = actual.end();
3318     string::const_iterator it2     = xlate.begin();
3319     string::const_iterator it2_end = xlate.end();
3320 
3321     for ( ;  it1 != it1_end  &&  it2 != it2_end;  ++it1, ++it2) {
3322         CRef<CSeq_loc> mapped = s_MapSingleAA(it1 - actual.begin(),
3323              mapped_protein_id, product_ranges, to_mrna, to_genomic);
3324         CRef<CCode_break> code_break;
3325         if (mapped && feat.GetData().GetCdregion().IsSetCode_break()) {
3326             if (!mapped->IsInt()) {
3327                 mapped->ChangeToPackedInt();
3328             }
3329             NON_CONST_ITERATE (CCdregion::TCode_break, it, feat.SetData().SetCdregion().SetCode_break()) {
3330                 CCode_break & cb = **it;
3331                 if (cb.GetLoc().Compare(*mapped, CSeq_loc::fCompare_Strand)==0) {
3332                     code_break = *it;
3333                     break;
3334                 }
3335             }
3336         }
3337         if ((m_flags & fTrustProteinSeq) && *it2 == 'X' && code_break) {
3338             /// internal stop codon; change the code-break to actual aa
3339 
3340             if ((m_flags & fForceTranslateCds)) {
3341                 // it's too late to change generated protein
3342                 NCBI_THROW(CException, eUnknown,
3343                            "fTrustProteinSeq & fForceTranslateCds combination not implemented");
3344             }
3345 
3346             char actual_aa = *it1;
3347             code_break->SetAa().SetNcbieaa(actual_aa);
3348 
3349         } else if (*it2 == '-' || *it2 == '*') {
3350             has_gap = true;
3351         } else if (*it1 != *it2) {
3352             ++mismatch_count;
3353         } else if (clean_match_count && (!mapped ||
3354                 (mapped->IsInt() && mapped->GetTotalRange().GetLength() == 3)))
3355         {
3356             ++*clean_match_count;
3357         }
3358     }
3359 
3360     if (has_stop && filled_by_polya) {
3361         CRef<CSeq_loc> mapped = s_MapSingleAA(xlate.size(), mapped_protein_id,
3362                                        product_ranges, to_mrna, to_genomic);
3363         if (mapped) {
3364             AddCodeBreak(feat, *mapped, '*');
3365             if (feat.IsSetComment()) {
3366                 feat.SetComment() += "; ";
3367             } else {
3368                 feat.SetComment("");
3369             }
3370             feat.SetComment() += "stop codon completed by the addition of "
3371                                  "3' A residues to the mRNA";
3372         } else {
3373             has_stop = false;
3374         }
3375     }
3376 
3377     string except_text;
3378 
3379     /// The process for setting the comment in some cases finds indels that our
3380     /// process here misses, so check the comment to determine if we have indels
3381     if (feat.IsSetComment() &&
3382         (feat.GetComment().find("indel") != string::npos ||
3383          feat.GetComment().find("inserted") != string::npos ||
3384          feat.GetComment().find("deleted") != string::npos))
3385     {
3386         has_indel = true;
3387     }
3388 
3389     if (actual.size() != xlate.size()  ||
3390         !has_stop  ||  !has_start  ||
3391         has_gap || has_indel) {
3392         except_text = "unclassified translation discrepancy";
3393     }
3394     else if (mismatch_count) {
3395         except_text = "mismatches in translation";
3396     }
3397 
3398     x_SetExceptText(feat, except_text);
3399 }
3400 
x_SetExceptText(CSeq_feat & feat,const string & text)3401 void CFeatureGenerator::SImplementation::x_SetExceptText(
3402       CSeq_feat& feat, const string &text)
3403 {
3404     string except_text = text;
3405 
3406     list<string> except_toks;
3407     if (feat.IsSetExcept_text()) {
3408         NStr::Split(feat.GetExcept_text(), ",", except_toks, NStr::fSplit_Tokenize);
3409 
3410         for (list<string>::iterator it = except_toks.begin();
3411              it != except_toks.end();  ) {
3412             NStr::TruncateSpacesInPlace(*it);
3413             if (it->empty()  ||
3414                 *it == "annotated by transcript or proteomic data" ||
3415                 *it == "unclassified transcription discrepancy"  ||
3416                 *it == "mismatches in transcription" ||
3417                 *it == "unclassified translation discrepancy"  ||
3418                 *it == "mismatches in translation") {
3419                 except_toks.erase(it++);
3420             }
3421             else {
3422                 ++it;
3423             }
3424         }
3425     }
3426 
3427     if ( !except_text.empty() ) {
3428         /// Check whether this is a Refseq product
3429         CBioseq_Handle bsh = m_scope->GetBioseqHandle(*feat.GetProduct().GetId());
3430         ITERATE(CBioseq_Handle::TId, it, bsh.GetId())
3431         if(it->GetSeqId()->IsOther() &&
3432            it->GetSeqId()->GetOther().GetAccession()[0] == 'N' &&
3433            string("MRP").find(it->GetSeqId()->GetOther().GetAccession()[1]) != string::npos)
3434         {
3435             except_text = "annotated by transcript or proteomic data";
3436 
3437             /// Refseq exception has to be combined with an inference qualifer
3438             string product_type_string;
3439             if(feat.GetData().IsCdregion())
3440                 product_type_string = "AA sequence";
3441             else {
3442                 NCBI_ASSERT(feat.GetData().IsRna(), "Bad feature type");
3443                 product_type_string = "RNA sequence";
3444                 if(feat.GetData().GetRna().CanGetType() &&
3445                    feat.GetData().GetRna().GetType() == CRNA_ref::eType_mRNA)
3446                     product_type_string += ", mRNA";
3447             }
3448             CRef<CGb_qual> qualifier(new CGb_qual);
3449             qualifier->SetQual("inference");
3450             qualifier->SetVal("similar to " + product_type_string + " (same species):RefSeq:" +
3451                               it->GetSeqId()->GetOther().GetAccession() + '.' +
3452                               NStr::IntToString(it->GetSeqId()->GetOther().GetVersion()));
3453             feat.SetQual().push_back(qualifier);
3454         }
3455 
3456         except_toks.push_back(except_text);
3457     }
3458     except_text = NStr::Join(except_toks, ", ");
3459 
3460     if (except_text.empty()) {
3461         // no exception; set states correctly
3462         feat.ResetExcept_text();
3463         feat.ResetExcept();
3464     } else {
3465         feat.SetExcept(true);
3466         feat.SetExcept_text(except_text);
3467     }
3468 }
3469 
x_SetQualForGapFilledModel(CSeq_feat & feat,CSeq_id_Handle id)3470 void CFeatureGenerator::SImplementation::x_SetQualForGapFilledModel(
3471       CSeq_feat& feat, CSeq_id_Handle id)
3472 {
3473     CBioseq_Handle bsh = m_scope->GetBioseqHandle(id);
3474     CSeq_id_Handle best_id = sequence::GetId(id, *m_scope);
3475 
3476     string product_type_string = "RNA sequence";
3477     const CMolInfo *mol_info = s_GetMolInfo(bsh);
3478     if (mol_info && mol_info->CanGetBiomol() &&
3479         mol_info->GetBiomol() == CMolInfo::eBiomol_mRNA) {
3480         product_type_string += ", mRNA";
3481     }
3482 
3483     string db = "INSD";
3484     if(best_id.GetSeqId()->IsOther() &&
3485        best_id.GetSeqId()->GetOther().GetAccession()[0] == 'N' &&
3486        string("MRP").find(best_id.GetSeqId()->GetOther().GetAccession()[1]) != string::npos)
3487         {
3488             db = "RefSeq";
3489         }
3490 
3491     CRef<CGb_qual> qualifier(new CGb_qual);
3492     qualifier->SetQual("inference");
3493     qualifier->SetVal("similar to " + product_type_string + " (same species):"+db+":" +
3494                       best_id.GetSeqId()->GetSeqIdString(true));
3495     feat.SetQual().push_back(qualifier);
3496 
3497 }
3498 
SetFeatureExceptions(CSeq_feat & feat,const CSeq_align * align,CSeq_feat * cds_feat,const CSeq_feat * cds_feat_on_query_mrna,const CSeq_feat * cds_feat_on_transcribed_mrna,list<CRef<CSeq_loc>> * transcribed_mrna_seqloc_refs,TSeqPos * clean_match_count)3499 void CFeatureGenerator::SImplementation::SetFeatureExceptions(CSeq_feat& feat,
3500                                       const CSeq_align* align,
3501                                       CSeq_feat* cds_feat,
3502                                       const CSeq_feat* cds_feat_on_query_mrna,
3503                                       const CSeq_feat* cds_feat_on_transcribed_mrna,
3504                                       list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs,
3505                                       TSeqPos *clean_match_count)
3506 {
3507     CConstRef<CSeq_align> align_ref;
3508 
3509     if (align && IsProteinAlign(*align)) {
3510         align_ref.Reset(align);
3511         CRef<CSeq_feat> fake_cds_feat;
3512         TransformProteinAlignToTranscript(align_ref, fake_cds_feat);
3513         align = align_ref.GetPointer();
3514     }
3515 
3516     // We're going to set the exception and add any needed inference qualifiers,
3517     // so if there's already an inference qualifer there, remove it.
3518     if (feat.IsSetQual()) {
3519         for (CSeq_feat::TQual::iterator it = feat.SetQual().begin();
3520             it != feat.SetQual().end();  )
3521         {
3522             if ((*it)->CanGetQual() && (*it)->GetQual() == "inference") {
3523                 it = feat.SetQual().erase(it);
3524             }
3525             else {
3526                 ++it;
3527             }
3528         }
3529         if (feat.GetQual().empty()) {
3530             feat.ResetQual();
3531         }
3532     }
3533 
3534     // Exceptions identified are:
3535     //
3536     //   - unclassified transcription discrepancy
3537     //   - mismatches in transcription
3538     //   - unclassified translation discrepancy
3539     //   - mismatches in translation
3540     switch (feat.GetData().Which()) {
3541     case CSeqFeatData::e_Rna:
3542         x_HandleRnaExceptions(feat, align, cds_feat, cds_feat_on_query_mrna);
3543         break;
3544 
3545     case CSeqFeatData::e_Cdregion:
3546         x_HandleCdsExceptions(feat, align,
3547                               cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna,
3548                               transcribed_mrna_seqloc_refs,
3549                               clean_match_count);
3550         break;
3551 
3552     case CSeqFeatData::e_Imp:
3553         switch (feat.GetData().GetSubtype()) {
3554         case CSeqFeatData::eSubtype_C_region:
3555         case CSeqFeatData::eSubtype_D_loop:
3556         case CSeqFeatData::eSubtype_D_segment:
3557         case CSeqFeatData::eSubtype_J_segment:
3558         case CSeqFeatData::eSubtype_S_region:
3559         case CSeqFeatData::eSubtype_V_region:
3560         case CSeqFeatData::eSubtype_V_segment:
3561             x_HandleRnaExceptions(feat, align, NULL, NULL);
3562             break;
3563 
3564         default:
3565             break;
3566         }
3567         break;
3568 
3569     default:
3570         break;
3571     }
3572 }
3573 
s_Count(unsigned num,const string & item_name)3574 static string s_Count(unsigned num, const string &item_name)
3575 {
3576     return NStr::NumericToString(num) + ' ' + item_name + (num == 1 ? "" : "s");
3577 }
3578 
x_SetComment(CSeq_feat & rna_feat,CSeq_feat * cds_feat,const CSeq_feat * cds_feat_on_mrna,const CSeq_align * align,const CRangeCollection<TSeqPos> & mismatch_locs,const CRangeCollection<TSeqPos> & insert_locs,const CRangeCollection<TSeqPos> & delete_locs,map<TSeqPos,TSeqPos> & delete_sizes,bool partial_unaligned_section)3579 void CFeatureGenerator::SImplementation::x_SetComment(CSeq_feat& rna_feat,
3580                       CSeq_feat *cds_feat,
3581                       const CSeq_feat *cds_feat_on_mrna,
3582                       const CSeq_align *align,
3583                       const CRangeCollection<TSeqPos> &mismatch_locs,
3584                       const CRangeCollection<TSeqPos> &insert_locs,
3585                       const CRangeCollection<TSeqPos> &delete_locs,
3586                       map<TSeqPos,TSeqPos> &delete_sizes,
3587                       bool partial_unaligned_section)
3588 {
3589     if (mismatch_locs.empty() && insert_locs.empty() && delete_locs.empty() &&
3590         !partial_unaligned_section &&
3591         !(m_is_gnomon && cds_feat &&
3592           cds_feat->GetData().GetCdregion().IsSetCode_break()))
3593     {
3594         return;
3595     }
3596 
3597     string rna_comment, cds_comment;
3598     CRangeCollection<TSeqPos> inserts_in_cds, deletes_in_cds, cds_ranges;
3599     if (cds_feat) {
3600         CSeq_loc_Mapper to_mrna(*align, CSeq_loc_Mapper::eSplicedRow_Prod);
3601         for (CSeq_loc_CI loc_it(cds_feat->GetLocation()); loc_it;  ++loc_it) {
3602             CRef<CSeq_loc> cds_on_mrna = to_mrna.Map(*loc_it.GetRangeAsSeq_loc());
3603             inserts_in_cds += cds_on_mrna->GetTotalRange();
3604 	    deletes_in_cds += cds_on_mrna->GetTotalRange();
3605 	}
3606         inserts_in_cds &= insert_locs;
3607         deletes_in_cds &= delete_locs;
3608     }
3609     if (cds_feat_on_mrna) {
3610         for (CSeq_loc_CI loc_it(cds_feat_on_mrna->GetLocation());
3611              loc_it;  ++loc_it)
3612         {
3613             cds_ranges += loc_it.GetRange();
3614         }
3615     }
3616 
3617     CRef<CUser_object> align_info(new CUser_object);
3618     align_info->SetType().SetStr("AlignInfo");
3619 
3620     if (m_is_best_refseq) {
3621         size_t indel_count = insert_locs.size() + delete_locs.size();
3622         size_t frameshift_count = 0;
3623         unsigned pct_coverage = 100, cds_pct_coverage = 100;
3624         if (partial_unaligned_section) {
3625             pct_coverage =
3626                 CScoreBuilderBase().GetPercentCoverage(*m_scope, *align);
3627             cds_pct_coverage =
3628                 CScoreBuilderBase().GetPercentCoverage(*m_scope, *align,
3629                                                        cds_ranges);
3630         }
3631         if (cds_feat && cds_feat_on_mrna) {
3632             size_t cds_indel_count = 0;
3633             ITERATE (CRangeCollection<TSeqPos>, it, inserts_in_cds) {
3634               ++(it->GetLength() % 3 ? frameshift_count : cds_indel_count);
3635             }
3636             ITERATE (CRangeCollection<TSeqPos>, it, deletes_in_cds) {
3637               ++(delete_sizes[it->GetFrom()] % 3 ? frameshift_count
3638                                                  : cds_indel_count);
3639             }
3640             indel_count -= frameshift_count;
3641             size_t cds_mismatch_count = 0;
3642             CSeqVector prot(cds_feat->GetProduct(), *m_scope,
3643                             CBioseq_Handle::eCoding_Iupac);
3644             const CTrans_table &translate =
3645                 CGen_code_table::GetTransTable(1);
3646             CSeq_loc_Mapper to_mrna(*cds_feat_on_mrna,
3647                                     CSeq_loc_Mapper::eProductToLocation);
3648             CSeq_loc_Mapper to_genomic(
3649                 *align, CSeq_loc_Mapper::eSplicedRow_Gen);
3650             CRef<CSeq_id> cds_id(new CSeq_id);
3651             cds_id->Assign(*cds_feat->GetProduct().GetId());
3652             TSeqPos start_pos =
3653                 cds_feat->GetProduct().GetStart(eExtreme_Positional);
3654             bool single_interval_product = ++cds_feat->GetProduct().begin()
3655                                           == cds_feat->GetProduct().end();
3656             if (!single_interval_product) {
3657                 NCBI_THROW(CException, eUnknown,
3658                            "product is required to be a single interval");
3659             }
3660             for (TSeqPos pos = start_pos; pos < start_pos + prot.size(); ++pos)
3661             {
3662                 CSeq_loc aa_loc(*cds_id, pos, pos);
3663                 CRef<CSeq_loc> rna_codon = to_mrna.Map(aa_loc);
3664                 CRef<CSeq_loc> genomic_codon = to_genomic.Map(*rna_codon);
3665                 CSeqVector codon(*genomic_codon, *m_scope,
3666                                  CBioseq_Handle::eCoding_Iupac);
3667                 if (codon.size() == 3) {
3668                     int state = CTrans_table::SetCodonState(
3669                                     codon[0], codon[1], codon[2]);
3670                     char translated_codon = pos == 0
3671                         ? translate.GetStartResidue(state)
3672                         : translate.GetCodonResidue(state);
3673                     if (translated_codon != prot[pos]) {
3674                         ++cds_mismatch_count;
3675                     }
3676                 }
3677             }
3678             if (cds_mismatch_count || cds_indel_count || frameshift_count || cds_pct_coverage < 100)
3679             {
3680                 cds_comment = "The RefSeq protein";
3681                 if (cds_mismatch_count) {
3682                     cds_comment += " has "
3683                                  + s_Count(cds_mismatch_count, "substitution");
3684                 }
3685                 if (frameshift_count) {
3686                     cds_comment += (cds_mismatch_count ? ", " : " has ")
3687                                  + s_Count(frameshift_count, "frameshift");
3688                 }
3689                 if (cds_indel_count) {
3690                     cds_comment += (cds_mismatch_count || frameshift_count ? ", " : " has ")
3691                                  + s_Count(cds_indel_count, "non-frameshifting indel");
3692                 }
3693                 if (cds_pct_coverage < 100) {
3694                     if (cds_mismatch_count || cds_indel_count || frameshift_count) {
3695                         cds_comment += " and";
3696                     }
3697                     cds_comment += " aligns at "
3698                                  + NStr::NumericToString(cds_pct_coverage)
3699                                  + "% coverage";
3700                     }
3701                 cds_comment += " compared to this genomic sequence";
3702             }
3703         }
3704         rna_comment = "The RefSeq transcript";
3705         if (!mismatch_locs.empty()) {
3706             rna_comment += " has " +
3707                 s_Count(mismatch_locs.GetCoveredLength(), "substitution");
3708             align_info->AddField("num_substitions", (int)mismatch_locs.GetCoveredLength());
3709         }
3710         if (frameshift_count) {
3711             rna_comment += (mismatch_locs.empty() ? " has " : ", ") +
3712                            s_Count(frameshift_count, "frameshift");
3713             align_info->AddField("num_frameshifts", (int)frameshift_count);
3714         }
3715         if (indel_count) {
3716             rna_comment += (mismatch_locs.empty() && !frameshift_count? " has " : ", ") +
3717                            s_Count(indel_count, "non-frameshifting indel");
3718             align_info->AddField("num_nonframeshift_indel", (int)indel_count);
3719         }
3720         if (partial_unaligned_section) {
3721             if (!mismatch_locs.empty() || indel_count || frameshift_count) {
3722                 rna_comment += " and";
3723             }
3724             rna_comment += " aligns at "
3725                          + NStr::NumericToString(pct_coverage)
3726                          + "% coverage";
3727         }
3728         if (rna_comment == "The RefSeq transcript") {
3729             rna_comment.clear();
3730         } else {
3731             rna_comment += " compared to this genomic sequence";
3732         }
3733     } else if (m_is_gnomon) {
3734         set<TSeqPos> insert_codons, delete_codons;
3735         TSeqPos inserted_bases = insert_locs.GetCoveredLength(),
3736                 cds_inserted_bases = inserts_in_cds.GetCoveredLength(),
3737                 deleted_bases = 0, cds_deleted_bases = 0,
3738                 code_breaks = 0;
3739         ITERATE (CRangeCollection<TSeqPos>, delete_it, delete_locs) {
3740             NCBI_ASSERT(delete_it->GetLength() == 1,
3741                         "Delete locations should always be one base");
3742             deleted_bases += delete_sizes.find(delete_it->GetFrom())->second;
3743         }
3744         ITERATE (CRangeCollection<TSeqPos>, insert_it, inserts_in_cds) {
3745             for (TSeqPos pos = insert_it->GetFrom();
3746                  pos <= insert_it->GetTo(); ++pos)
3747             {
3748                 insert_codons.insert((pos - cds_ranges.GetFrom()) / 3);
3749             }
3750         }
3751         ITERATE (CRangeCollection<TSeqPos>, delete_it, deletes_in_cds) {
3752             NCBI_ASSERT(delete_it->GetLength() == 1,
3753                         "Delete locations should always be one base");
3754             delete_codons.insert((delete_it->GetFrom() -
3755                                   cds_ranges.GetFrom()) / 3);
3756             cds_deleted_bases +=
3757                 delete_sizes.find(delete_it->GetFrom())->second;
3758         }
3759         if(cds_feat && cds_feat->GetData().GetCdregion().IsSetCode_break()) {
3760             ITERATE (CCdregion::TCode_break, it,
3761                      cds_feat->GetData().GetCdregion().GetCode_break())
3762             {
3763                 char aa = 0;
3764                 switch ((*it)->GetAa().Which()) {
3765                 case CCode_break::TAa::e_Ncbieaa:
3766                     aa = (*it)->GetAa().GetNcbieaa();
3767                     break;
3768 
3769                 case CCode_break::TAa::e_Ncbistdaa:
3770                     {{
3771                         string src_string(1, (*it)->GetAa().GetNcbistdaa()),
3772                                dst_string;
3773                         CSeqConvert::Convert(src_string, CSeqUtil::e_Ncbistdaa,
3774                                              0, 1, dst_string,
3775                                              CSeqUtil::e_Ncbieaa);
3776                         aa = dst_string[0];
3777                     }}
3778                     break;
3779 
3780                 case CCode_break::TAa::e_Ncbi8aa:
3781                     {{
3782                         string src_string(1, (*it)->GetAa().GetNcbi8aa()),
3783                                dst_string;
3784                         CSeqConvert::Convert(src_string, CSeqUtil::e_Ncbi8aa,
3785                                              0, 1, dst_string,
3786                                              CSeqUtil::e_Ncbieaa);
3787                         aa = dst_string[0];
3788                     }}
3789                     break;
3790 
3791                 default:
3792                     break;
3793                 }
3794                 if (aa != 'U') {
3795                     ++code_breaks;
3796                 }
3797             }
3798         }
3799         if (inserted_bases || deleted_bases) {
3800             rna_comment = k_rna_comment;
3801         }
3802         if (inserted_bases) {
3803             rna_comment += ": inserted " + s_Count(inserted_bases, "base")
3804                          + " in " + s_Count(insert_codons.size(), "codon");
3805         }
3806         if (deleted_bases) {
3807             rna_comment += string(NStr::EndsWith(rna_comment,"CDS") ? ":" : ";")
3808                          + " deleted " + s_Count(deleted_bases, "base")
3809                          + " in " + s_Count(delete_codons.size(), "codon");
3810         }
3811         if (cds_inserted_bases || cds_deleted_bases || code_breaks) {
3812             cds_comment = k_cds_comment;
3813         }
3814         if (cds_inserted_bases) {
3815             cds_comment += ": inserted " + s_Count(cds_inserted_bases, "base")
3816                          + " in " + s_Count(insert_codons.size(), "codon");
3817         }
3818         if (cds_deleted_bases) {
3819             cds_comment += string(NStr::EndsWith(cds_comment,"CDS") ? ":" : ";")
3820                          + " deleted " + s_Count(cds_deleted_bases, "base")
3821                          + " in " + s_Count(delete_codons.size(), "codon");
3822         }
3823         if (code_breaks) {
3824             cds_comment += string(NStr::EndsWith(cds_comment,"CDS") ? ":" : ";")
3825                          + " substituted " + s_Count(code_breaks, "base")
3826                          + " at " + s_Count(code_breaks, "genomic stop codon");
3827         }
3828     }
3829 
3830     if (!rna_comment.empty()) {
3831         if (!rna_feat.IsSetComment()) {
3832             rna_feat.SetComment(rna_comment);
3833         /// If comment is already set, check it doesn't already contain our text
3834         } else if (rna_feat.GetComment().find(rna_comment) == string::npos) {
3835             rna_feat.SetComment() += "; " + rna_comment;
3836         }
3837     }
3838     if (!cds_comment.empty()) {
3839         if (!cds_feat->IsSetComment()) {
3840             cds_feat->SetComment(cds_comment);
3841         /// If comment is already set, check it doesn't already contain our text
3842         } else if (cds_feat->GetComment().find(cds_comment) == string::npos) {
3843             cds_feat->SetComment() += "; " + cds_comment;
3844         }
3845     }
3846     if (!align_info->GetData().empty()) {
3847         rna_feat.AddExt(align_info);
3848     }
3849 }
3850 
x_SetCommentForGapFilledModel(CSeq_feat & feat,TSeqPos insert_length)3851 void CFeatureGenerator::SImplementation::x_SetCommentForGapFilledModel
3852 (CSeq_feat& feat,
3853  TSeqPos insert_length)
3854 {
3855     _ASSERT(insert_length > 0);
3856     string comment;
3857     if (feat.GetData().IsRna()) {
3858         comment = k_rna_comment;
3859     } else if (feat.GetData().IsCdregion()) {
3860         comment = k_cds_comment;
3861     }
3862     comment += ":";
3863     if (!feat.IsSetComment()) {
3864         feat.SetComment(comment);
3865     } else if (feat.GetComment().find(comment) == string::npos) {
3866         feat.SetComment() += " " + comment;
3867     } else {
3868         feat.SetComment() += ";";
3869     }
3870 
3871     comment = " added " + s_Count(insert_length, "base") + " not found in genome assembly";
3872     feat.SetComment() += comment;
3873 }
3874 
3875 void CFeatureGenerator::SImplementation::
x_AddSelectMarkup(const CSeq_align & align,const CBioseq_Handle & rna_handle,const CSeq_id & genomic_acc,CSeq_feat & rna_feat,CSeq_feat * cds_feat)3876 x_AddSelectMarkup(const CSeq_align &align,
3877                   const CBioseq_Handle& rna_handle,
3878                   const CSeq_id &genomic_acc,
3879                   CSeq_feat& rna_feat, CSeq_feat* cds_feat)
3880 {
3881     bool need_location_check = !(m_flags & fSkipLocationCheck);
3882     e_MatchType match_found = eNone;
3883     string ensembl_match_rna, ensembl_match_cds;
3884     vector<string> keywords;
3885     bool drop = false;
3886     vector<CSeqdesc::E_Choice> desc_types = {CSeqdesc::e_User,
3887                                              CSeqdesc::e_Genbank};
3888     for (CSeqdesc_CI desc(rna_handle, desc_types); desc; ++desc) {
3889         if (desc->IsGenbank() && desc->GetGenbank().IsSetKeywords()) {
3890             for (const string &keyword : desc->GetGenbank().GetKeywords()) {
3891                 if (m_flags & fDropManeMarkup &&
3892                     (keyword == "MANE Select" || keyword == "MANE Plus"))
3893                 {
3894                     drop = true;
3895                     if (keyword == "MANE Select") {
3896                         keywords.push_back("RefSeq Select");
3897                     }
3898                 } else {
3899                     keywords.push_back(keyword);
3900                 }
3901             }
3902         } else if (desc->IsUser() &&
3903                    desc->GetUser().HasField("MANE Ensembl match"))
3904         {
3905             NStr::SplitInTwo(desc->GetUser().GetField("MANE Ensembl match")
3906                                             .GetString(),
3907                              "/", ensembl_match_rna, ensembl_match_cds);
3908             NStr::TruncateSpacesInPlace(ensembl_match_rna);
3909             NStr::TruncateSpacesInPlace(ensembl_match_cds);
3910         } else if (desc->IsUser() && desc->GetUser().GetType().IsStr() &&
3911                    desc->GetUser().GetType().GetStr() == "RefGeneTracking" &&
3912                    need_location_check)
3913         {
3914             if (desc->GetUser().HasField("EnsemblLocation")) {
3915                 match_found = x_CheckMatch(align, genomic_acc,
3916                                   desc->GetUser().GetField("EnsemblLocation"));
3917             } else if (desc->GetUser().HasField("SelectGeneLocation")) {
3918                 /// SelectGeneLocation is never treated as an exact match
3919                 match_found = min(eOverlap,
3920                                   x_CheckMatch(align, genomic_acc,
3921                                   desc->GetUser().GetField("SelectGeneLocation")));
3922             }
3923         }
3924     }
3925 
3926     if ((match_found >= eOverlap || !need_location_check) && !keywords.empty())
3927     {
3928         /// Found overlap; add uql to features
3929         x_AddKeywordQuals(rna_feat, keywords);
3930         if (cds_feat) {
3931             x_AddKeywordQuals(*cds_feat, keywords);
3932         }
3933     }
3934 
3935     if (match_found == eExact && !drop && !ensembl_match_rna.empty()) {
3936         CRef<CDbtag> rna_ensembl_ref(new CDbtag);
3937         rna_ensembl_ref->SetDb("Ensembl");
3938         rna_ensembl_ref->SetTag().SetStr(ensembl_match_rna);
3939         rna_feat.SetDbxref().push_back(rna_ensembl_ref);
3940         if (cds_feat && !ensembl_match_cds.empty()) {
3941             CRef<CDbtag> cds_ensembl_ref(new CDbtag);
3942             cds_ensembl_ref->SetDb("Ensembl");
3943             cds_ensembl_ref->SetTag().SetStr(ensembl_match_cds);
3944             cds_feat->SetDbxref().push_back(cds_ensembl_ref);
3945         }
3946     }
3947 }
3948 
3949 CFeatureGenerator::SImplementation::e_MatchType
x_CheckMatch(const CSeq_align & align,const CSeq_id & genomic_acc,const CUser_field & loc_field)3950 CFeatureGenerator::SImplementation::x_CheckMatch(const CSeq_align &align,
3951                                                  const CSeq_id &genomic_acc,
3952                                                  const CUser_field &loc_field)
3953 {
3954     if (!loc_field.HasField("seq_id") || !loc_field.HasField("from") ||
3955         !loc_field.HasField("to") || !loc_field.HasField("strand"))
3956     {
3957         NCBI_THROW(CException, eUnknown, loc_field.GetLabel().GetStr()
3958                                        + " doesn't have expected fields");
3959     }
3960 
3961     CSeq_id loc_genomic_acc(loc_field.GetField("seq_id").GetString());
3962     if (loc_genomic_acc.GetTextseq_Id()->GetAccession() ==
3963             genomic_acc.GetTextseq_Id()->GetAccession() &&
3964         loc_genomic_acc.GetTextseq_Id()->GetVersion() >
3965             genomic_acc.GetTextseq_Id()->GetVersion())
3966     {
3967         /// If location is on a newer version then alignment, this is always
3968         /// considered an overlap, regardless of position and strand
3969         return eOverlap;
3970     }
3971 
3972     ENa_strand loc_strand = loc_field.GetField("strand").GetString() == "-"
3973                           ? eNa_strand_minus : eNa_strand_plus;
3974 
3975     /// Otherwise, this is considered a match only if same seq-id and on same
3976     /// strand
3977     if (!loc_genomic_acc.Match(genomic_acc) || loc_strand != align.GetSeqStrand(1))
3978     {
3979         return eNone;
3980     }
3981 
3982     // Considered an overlap if at least 50% of location intersects alignment
3983     TSeqRange loc_range(loc_field.GetField("from").GetInt(),
3984                         loc_field.GetField("to").GetInt());
3985     return loc_range == align.GetSeqRange(1) ? eExact
3986          : (loc_range.IntersectingWith(align.GetSeqRange(1))
3987              ? eOverlap : eNone);
3988 }
3989 
3990 void CFeatureGenerator::SImplementation::
x_AddKeywordQuals(CSeq_feat & feat,const vector<string> & keywords)3991 x_AddKeywordQuals(CSeq_feat &feat, const vector<string> &keywords)
3992 {
3993     for (const string &keyword : keywords) {
3994         CRef<CGb_qual> qualifier(new CGb_qual);
3995         qualifier->SetQual("tag");
3996         qualifier->SetVal(keyword);
3997         feat.SetQual().push_back(qualifier);
3998     }
3999 }
4000 
MergeSeq_locs(const CSeq_loc * loc1,const CSeq_loc * loc2)4001 CRef<CSeq_loc> CFeatureGenerator::SImplementation::MergeSeq_locs(const CSeq_loc* loc1, const CSeq_loc* loc2)
4002 {
4003     CRef<CSeq_loc> merged_loc;
4004 
4005     if (loc1->GetStart(eExtreme_Positional) < loc1->GetStop(eExtreme_Positional) &&
4006         (loc2==NULL ||
4007          loc2->GetStart(eExtreme_Positional) < loc2->GetStop(eExtreme_Positional))) {
4008 
4009         if (loc2==NULL)
4010             merged_loc = loc1->Merge(CSeq_loc::fMerge_SingleRange, NULL);
4011         else
4012             merged_loc = loc1->Add(*loc2, CSeq_loc::fMerge_SingleRange, NULL);
4013     } else {
4014         // cross the origin
4015 
4016         _ASSERT(loc2 == NULL ||
4017                 (loc1->Intersect(*loc2, 0, NULL)->IsNull() == false &&
4018                  loc1->Intersect(*loc2, 0, NULL)->IsEmpty() == false));
4019 
4020         CRef<CSeq_id> id(new CSeq_id);
4021         id->Assign(*loc1->GetId());
4022 
4023         TSeqPos genomic_size = m_scope->GetSequenceLength(*id);
4024         CRef<CSeq_loc> left_loc(new CSeq_loc(*id, genomic_size-1, genomic_size-1, loc1->GetStrand()));
4025         CRef<CSeq_loc> right_loc(new CSeq_loc(*id, 0, 0, loc1->GetStrand()));
4026 
4027         merged_loc = left_loc;
4028         merged_loc->Add(*right_loc);
4029         merged_loc->Add(*loc1);
4030         if (loc2 != NULL)
4031         merged_loc->Add(*loc2);
4032 
4033         TSeqPos x[] = {
4034             loc1->GetStart(eExtreme_Positional),
4035             loc1->GetStop(eExtreme_Positional),
4036             (loc2 ? loc2->GetStart(eExtreme_Positional) : 0),
4037             (loc2 ? loc2->GetStop(eExtreme_Positional) : 0)
4038         };
4039 
4040         if (x[0] > x[1])
4041             x[1] += genomic_size;
4042         if (x[2] > x[3])
4043             x[3] += genomic_size;
4044 
4045         if (x[1] < x[2]) {
4046             x[0] += genomic_size;
4047             x[1] += genomic_size;
4048         } else if (x[3] < x[0]) {
4049             x[2] += genomic_size;
4050             x[3] += genomic_size;
4051         }
4052 
4053 
4054         x[0] = min(x[0], x[2]);
4055         x[1] = max(x[1], x[3]) - genomic_size;
4056         _ASSERT( x[0] > x[1] +1 );
4057 
4058         merged_loc = FixOrderOfCrossTheOriginSeqloc(*merged_loc,
4059                                                     (x[0]+x[1])/2,
4060                                                     CSeq_loc::fMerge_SingleRange);
4061     }
4062     return merged_loc;
4063 }
4064 
FixOrderOfCrossTheOriginSeqloc(const CSeq_loc & loc,TSeqPos outside_point,CSeq_loc::TOpFlags flags)4065 CRef<CSeq_loc> CFeatureGenerator::SImplementation::FixOrderOfCrossTheOriginSeqloc
4066 (const CSeq_loc& loc,
4067  TSeqPos outside_point,
4068  CSeq_loc::TOpFlags flags)
4069 {
4070     CRef<CSeq_id> id(new CSeq_id);
4071     id->Assign(*loc.GetId());
4072 
4073     TSeqPos genomic_size = m_scope->GetSequenceLength(*id);
4074     CRef<CSeq_loc> left_loc(new CSeq_loc);
4075     CRef<CSeq_loc> right_loc(new CSeq_loc);
4076 
4077     ITERATE(CSeq_loc, it, loc) {
4078         if (it.GetRangeAsSeq_loc()->GetStart(eExtreme_Biological) > outside_point)
4079             left_loc->Add(*it.GetRangeAsSeq_loc());
4080         else
4081             right_loc->Add(*it.GetRangeAsSeq_loc());
4082     }
4083 
4084     left_loc = left_loc->Merge(flags, NULL);
4085     right_loc = right_loc->Merge(flags, NULL);
4086 
4087     bool no_gap_at_origin = (left_loc->GetStop(eExtreme_Positional) == genomic_size-1 &&
4088                              right_loc->GetStart(eExtreme_Positional) == 0);
4089 
4090     if (loc.IsReverseStrand()) {
4091         swap(left_loc, right_loc);
4092     }
4093     left_loc->Add(*right_loc);
4094 
4095     if (no_gap_at_origin) {
4096         left_loc->ChangeToPackedInt();
4097         NON_CONST_ITERATE(CPacked_seqint::Tdata, it,left_loc->SetPacked_int().Set()) {
4098             CSeq_interval& interval = **it;
4099             if (interval.GetFrom() == 0) {
4100                 interval.SetFuzz_from().SetLim(CInt_fuzz::eLim_circle);
4101             }
4102             if (interval.GetTo() == genomic_size-1) {
4103                 interval.SetFuzz_to().SetLim(CInt_fuzz::eLim_circle);
4104             }
4105         }
4106     }
4107 
4108     return left_loc;
4109 }
4110 
4111 bool CFeatureGenerator::
HasMixedGenomicIds(const CSeq_align & align)4112 SImplementation::HasMixedGenomicIds(const CSeq_align& align)
4113 {
4114     set<CSeq_id_Handle> genomic_ids;
4115 
4116     if (!align.GetSegs().IsSpliced()) {
4117         return false;
4118     }
4119 
4120     const CSpliced_seg& sps = align.GetSegs().GetSpliced();
4121     if(sps.CanGetGenomic_id())
4122         genomic_ids.insert(CSeq_id_Handle::GetHandle(sps.GetGenomic_id()));
4123 
4124     const CSpliced_seg& spliced_seg = align.GetSegs().GetSpliced();
4125     ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
4126         const CSpliced_exon& exon = **it;
4127         if (exon.CanGetGenomic_id()) {
4128             genomic_ids.insert(CSeq_id_Handle::GetHandle(exon.GetGenomic_id()));
4129         }
4130     }
4131 
4132     return genomic_ids.size() > 1;
4133 }
4134 
AddInsertWithGaps(CRef<CSeq_loc> & edited_sequence_seqloc,CSeq_id & genomic_seqid,int & region_begin,int & region_end,int & offset,CRef<CSeq_loc> & insert,const int k_gap_length,const int next_exon_start)4135 void AddInsertWithGaps(
4136                        CRef<CSeq_loc>& edited_sequence_seqloc,
4137                        CSeq_id& genomic_seqid,
4138                        int& region_begin,
4139                        int& region_end,
4140                        int& offset,
4141                        CRef<CSeq_loc>& insert,
4142                        const int k_gap_length,
4143                        const int next_exon_start)
4144 {
4145     if (insert->SetMix().Set().size() > 1) {
4146         NCBI_THROW(CException, eUnknown, "spliced-seq with several insert exons in a row not supported");
4147     }
4148 
4149     if (insert->SetMix().Set().size() > 0) {
4150         int half_intron_length = (next_exon_start - region_end)/2;
4151         int copy_length = min(k_gap_length, half_intron_length);
4152         region_end += copy_length;
4153 
4154         if (region_begin < region_end) {
4155             CRef<CSeq_loc> genome_loc(new CSeq_loc(genomic_seqid,
4156                                                    region_begin,
4157                                                    region_end -1));
4158             edited_sequence_seqloc->SetMix().Set().push_back(genome_loc);
4159         }
4160         if (copy_length < k_gap_length) {
4161             int gap_length = k_gap_length - copy_length;
4162             // fill gap with sequence from the genome itself for simplicity
4163             // do not bother creating nonexisting sequence
4164             CRef<CSeq_loc> gap_loc(new CSeq_loc(genomic_seqid, 0, gap_length-1));
4165             edited_sequence_seqloc->SetMix().Set().push_back(gap_loc);
4166             offset += gap_length;
4167         }
4168 
4169         edited_sequence_seqloc->SetMix().Set().push_back(insert);
4170         insert.Reset(new CSeq_loc);
4171 
4172         if (copy_length < k_gap_length) {
4173             int gap_length = k_gap_length - copy_length;
4174             CRef<CSeq_loc> gap_loc(new CSeq_loc(genomic_seqid, 0, gap_length-1));
4175             edited_sequence_seqloc->SetMix().Set().push_back(gap_loc);
4176             offset += gap_length;
4177         }
4178 
4179         region_begin = region_end;
4180     }
4181 }
4182 
4183 CRef<CSeq_feat>
4184 CFeatureGenerator::
ConvertMixedAlignToAnnot(const CSeq_align & input_align,CSeq_annot & annot,CBioseq_set & seqs,Int8 gene_id,const CSeq_feat * cds_feat_on_query_mrna_ptr,bool call_on_align_list)4185 SImplementation::ConvertMixedAlignToAnnot(const CSeq_align& input_align,
4186                                      CSeq_annot& annot,
4187                                      CBioseq_set& seqs,
4188                                      Int8 gene_id,
4189                                      const CSeq_feat* cds_feat_on_query_mrna_ptr,
4190                                      bool call_on_align_list)
4191 {
4192 
4193     CRef<CSeq_align> align(new CSeq_align);
4194     align->Assign(input_align);
4195 
4196     CRef<CSeq_loc> edited_sequence_seqloc(new CSeq_loc);
4197 
4198 
4199     CSpliced_seg& spliced_seg = align->SetSegs().SetSpliced();
4200     if (!spliced_seg.CanGetGenomic_id()) {
4201         NCBI_THROW(CException, eUnknown, "Mixed-genomic spliced-seq does not have spliced-seg.genomic_id");
4202     }
4203     CRef<CSeq_id> genomic_seqid(new CSeq_id);
4204     genomic_seqid->Assign(spliced_seg.GetGenomic_id());
4205     ENa_strand genomic_strand = eNa_strand_plus;
4206     if (spliced_seg.CanGetGenomic_strand()) {
4207         genomic_strand = spliced_seg.GetGenomic_strand();
4208     } else {
4209         ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
4210             const CSpliced_exon& exon = **it;
4211             if ((!exon.CanGetGenomic_id() || exon.GetGenomic_id().Match(*genomic_seqid)) &&
4212                 exon.CanGetGenomic_strand()) {
4213                 genomic_strand = exon.GetGenomic_strand();
4214                 break;
4215             }
4216         }
4217     }
4218 
4219     CSeq_id_Handle idh= CSeq_id_Handle::GetHandle(*genomic_seqid);
4220     CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh);
4221     TSeqPos genomic_length = bsh.GetBioseqLength();
4222 
4223     { // collect genomic seqlocs for virtual sequence and map exons to it
4224 
4225         const int k_gap_length = min(1000, int(genomic_length));
4226 
4227     if (genomic_strand == eNa_strand_minus) {
4228         // reverse exons to process them same way as plus strand
4229         // will reverse back in the end
4230         spliced_seg.SetExons().reverse();
4231     }
4232     int region_begin = 0; //included endpoint
4233     int region_end = 0;   //not included endpoint
4234     int offset = 0;
4235     CRef<CSeq_loc> insert(new CSeq_loc);
4236     NON_CONST_ITERATE(CSpliced_seg::TExons, it, spliced_seg.SetExons()) {
4237         CSpliced_exon& exon = **it;
4238         CSeq_id& seqid = exon.CanGetGenomic_id() ? exon.SetGenomic_id() : *genomic_seqid;
4239 
4240         int exon_start = exon.GetGenomic_start(); // included endpoint
4241         int exon_stop = exon.GetGenomic_end();    // included endpoint
4242 
4243         if (!seqid.Match(*genomic_seqid)) {
4244 
4245             ENa_strand strand = exon.CanGetGenomic_strand() ? exon.GetGenomic_strand() : genomic_strand;
4246             CRef<CSeq_loc> loc(new CSeq_loc(seqid, exon_start, exon_stop, strand));
4247             if (genomic_strand == eNa_strand_minus) {
4248                 loc->FlipStrand();
4249             }
4250             insert->SetMix().Set().push_back(loc);
4251 
4252             int exon_length = exon_stop - exon_start +1;
4253             exon_stop = region_end + k_gap_length -1;
4254             exon_start = region_end + k_gap_length - exon_length;
4255             offset += exon_length;
4256         } else {
4257             if (exon.CanGetGenomic_strand() && exon.GetGenomic_strand() != genomic_strand) {
4258                 NCBI_THROW(CException, eUnknown, "spliced-seq with mixed genomic strands not supported");
4259             }
4260             if (!(region_end <= exon_start)) {
4261                 NCBI_THROW(CException, eUnknown, "spliced-seq with exons out of order not supported");
4262             }
4263 
4264             AddInsertWithGaps(edited_sequence_seqloc,
4265                               *genomic_seqid,
4266                               region_begin,
4267                               region_end,
4268                               offset,
4269                               insert,
4270                               k_gap_length,
4271                               exon_start);
4272 
4273             region_end = exon_stop +1;
4274         }
4275 
4276         exon.ResetGenomic_id();
4277         exon.ResetGenomic_strand();
4278 
4279         exon.SetGenomic_start(exon_start + offset);
4280         exon.SetGenomic_end(exon_stop + offset);
4281     }
4282 
4283     AddInsertWithGaps(edited_sequence_seqloc,
4284                       *genomic_seqid,
4285                       region_begin,
4286                       region_end,
4287                       offset,
4288                       insert,
4289                       k_gap_length,
4290                       genomic_length);
4291 
4292     if (region_begin < (int)genomic_length) {
4293         CRef<CSeq_loc> genome_loc(new CSeq_loc(*genomic_seqid,
4294                                                region_begin,
4295                                                genomic_length -1));
4296         edited_sequence_seqloc->SetMix().Set().push_back(genome_loc);
4297     }
4298 
4299     if (genomic_strand == eNa_strand_minus) {
4300         // reverse exons back
4301         spliced_seg.SetExons().reverse();
4302     }
4303     spliced_seg.SetGenomic_strand(genomic_strand);
4304     }
4305 
4306     edited_sequence_seqloc->ChangeToPackedInt();
4307     CRef<CBioseq> bioseq(new CBioseq(*edited_sequence_seqloc));
4308     CRef<CSeq_entry> seqentry(new CSeq_entry);
4309     seqentry->SetSeq(*bioseq);
4310 
4311     bioseq->SetInst().SetTopology() = bsh.GetCompleteBioseq()->GetInst().GetTopology();
4312     {{
4313         CSeqdesc_CI desc(bsh, CSeqdesc::e_Source);
4314         if (desc) {
4315             CRef<CSeqdesc> seq_desc(new CSeqdesc);
4316             seq_desc->Assign(*desc);
4317             bioseq->SetDescr().Set().push_back(seq_desc);
4318         }
4319     }}
4320     {{
4321         CSeqdesc_CI desc(bsh, CSeqdesc::e_Org);
4322         if (desc) {
4323             CRef<CSeqdesc> seq_desc(new CSeqdesc);
4324             seq_desc->Assign(*desc);
4325             bioseq->SetDescr().Set().push_back(seq_desc);
4326         }
4327     }}
4328 
4329     CBioseq_Handle bioseq_handle = m_scope->AddBioseq(*bioseq);
4330 
4331     CRef<CSeq_id> bioseq_id(new CSeq_id);
4332     bioseq_id->Assign(*bioseq->GetFirstId());
4333     spliced_seg.SetGenomic_id(*bioseq_id);
4334 
4335     CRef<CSeq_feat> gene_feat;
4336     if (gene_id) {
4337         TGeneMap::iterator gene = genes.find(gene_id);
4338         if (gene != genes.end()) {
4339             gene_feat = gene->second;
4340             genes.erase(gene);
4341         }
4342     }
4343 
4344     CSeq_annot annot_local;
4345     CBioseq_set seqs_tmp;
4346     ConvertAlignToAnnot(*align, annot_local, seqs_tmp, gene_id, cds_feat_on_query_mrna_ptr,
4347                                                call_on_align_list);
4348 
4349     m_scope->RemoveBioseq(bioseq_handle);
4350     annot_local.SetData().SetFtable().clear();
4351 
4352     if (gene_id) {
4353         if (gene_feat) {
4354             genes[gene_id] = gene_feat;
4355         } else {
4356             genes.erase(gene_id);
4357         }
4358     }
4359 
4360     set<CSeq_id_Handle> insert_ids;
4361     TSeqPos insert_length = 0;
4362     TSeqPos cds_insert_length = 0;
4363 
4364     align.Reset(new CSeq_align);
4365     align->Assign(input_align);
4366     {
4367         CSpliced_seg& spliced_seg = align->SetSegs().SetSpliced();
4368         ERASE_ITERATE(CSpliced_seg::TExons, it, spliced_seg.SetExons()) {
4369             CSpliced_exon& exon = **it;
4370             CSeq_id& seqid = exon.CanGetGenomic_id() ? exon.SetGenomic_id() : *genomic_seqid;
4371 
4372             if (!seqid.Match(*genomic_seqid)) {
4373                 insert_ids.insert(CSeq_id_Handle::GetHandle(seqid));
4374                 insert_length += exon.GetGenomic_end()-exon.GetGenomic_start()+1;
4375 
4376                 if (cds_feat_on_query_mrna_ptr) {
4377                     int cds_intersection_len =
4378                         min(exon.GetProduct_end().GetNucpos(),
4379                             cds_feat_on_query_mrna_ptr->GetLocation().GetStop(eExtreme_Positional)) -
4380                         max(exon.GetProduct_start().GetNucpos(),
4381                             cds_feat_on_query_mrna_ptr->GetLocation().GetStart(eExtreme_Positional))
4382                         +1;
4383                     if (cds_intersection_len > 0) {
4384                         cds_insert_length += cds_intersection_len;
4385                     }
4386                 }
4387 
4388                 spliced_seg.SetExons().erase(it);
4389             }
4390         }
4391     }
4392 
4393     CBioseq_set seqs_discard;
4394     CRef<CSeq_feat> feat =
4395         ConvertAlignToAnnot(*align, annot_local, seqs_discard,
4396                             gene_id, cds_feat_on_query_mrna_ptr,
4397                             call_on_align_list);
4398 
4399     // inst.hist.assembly = input annot
4400     align.Reset(new CSeq_align);
4401     align->Assign(input_align);
4402     NON_CONST_ITERATE(CBioseq_set::TSeq_set, it, seqs_tmp.SetSeq_set()) {
4403         CSeq_entry& entry = **it;
4404         if (entry.IsSeq() &&
4405             entry.GetSeq().IsSetInst() &&
4406             entry.GetSeq().GetInst().IsSetHist() &&
4407             entry.GetSeq().GetInst().GetHist().IsSetAssembly()) {
4408 
4409             entry.SetSeq().SetInst().SetHist().SetAssembly().front() =
4410                 align;
4411             break;
4412         }
4413     }
4414     if (seqs_tmp.IsSetClass()) {
4415         seqs.SetClass(seqs_tmp.GetClass());
4416     }
4417     seqs.SetSeq_set().splice(seqs.SetSeq_set().end(), seqs_tmp.SetSeq_set());
4418 
4419     for (list<CRef<CSeq_feat> >::reverse_iterator it = annot_local.SetData().SetFtable().rbegin();
4420          it != annot_local.SetData().SetFtable().rend(); ++it) {
4421         CSeq_feat& f = **it;
4422         if (f.GetData().IsGene()) {
4423             continue;
4424         }
4425 
4426         if (f.GetData().IsCdregion() && cds_insert_length==0) {
4427             continue;
4428         }
4429 
4430         x_SetExceptText(f, k_except_text_for_gap_filled_gnomon_model);
4431         _ASSERT(insert_ids.size() > 0);
4432         NON_CONST_ITERATE (set<CSeq_id_Handle>, id, insert_ids) {
4433             x_SetQualForGapFilledModel(f, *id);
4434         }
4435         x_SetCommentForGapFilledModel(f, f.GetData().IsCdregion() ? cds_insert_length : insert_length);
4436     }
4437 
4438     annot.SetData().SetFtable().splice(annot.SetData().SetFtable().end(),
4439                                        annot_local.SetData().SetFtable());
4440 
4441     return feat;
4442 }
4443 
4444 END_NCBI_SCOPE
4445