1 /*
2 *  $Id: NSeq.cpp 494932 2016-03-11 18:54:17Z kiryutin $
3 *
4 * =========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannt warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * =========================================================================
27 *
28 *  Author: Boris Kiryutin
29 *
30 * =========================================================================
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <algo/align/prosplign/prosplign_exception.hpp>
37 
38 #include "NSeq.hpp"
39 #include "nucprot.hpp"
40 
41 #include <objmgr/util/seq_loc_util.hpp>
42 #include <objmgr/seq_vector.hpp>
43 
44 BEGIN_NCBI_SCOPE
45 BEGIN_SCOPE(prosplign)
46 USING_SCOPE(ncbi::objects);
47 
CNSeq(void)48 CNSeq::CNSeq(void)
49 {
50     m_size = 0;
51 }
52 
~CNSeq(void)53 CNSeq::~CNSeq(void)
54 {
55 }
56 
57 // letter by position
Upper(int pos) const58 char CNSeq::Upper(int pos) const
59 {
60     return CTranslationTable::NucToChar(seq[pos]);
61 }
62 
63 
Init(const CNSeq & fullseq,const vector<pair<int,int>> & igi)64 void CNSeq::Init(const CNSeq& fullseq, const vector<pair<int, int> >& igi)
65 {
66     seq.clear();
67     NSEQ::const_iterator beg, end;
68     beg = fullseq.seq.begin();
69     m_size = fullseq.m_size;
70     _ASSERT((int)fullseq.seq.size() >= fullseq.size());
71     for(vector<pair<int, int> >::const_iterator it = igi.begin(); it != igi.end(); ++it) {
72         if(it->first < 0 || it->second < 1) NCBI_THROW(CProSplignException, eGenericError, "Intron coordinates are invalid");
73         if(it->first + it->second > fullseq.size()) NCBI_THROW(CProSplignException, eGenericError, "Intron coordinate is out of sequence");
74         end = fullseq.seq.begin() + it->first;
75         if(end < beg) NCBI_THROW(CProSplignException, eGenericError, "Intron coordinates have wrong order");
76         seq.insert(seq.end(), beg, end);
77         beg = end + it->second;
78         m_size -= it->second;
79     }
80     if(beg < fullseq.seq.end()) seq.insert(seq.end(), beg, fullseq.seq.end());
81 }
82 
Init(CScope & scope,CSeq_loc & genomic)83 void CNSeq::Init(CScope& scope, CSeq_loc& genomic)
84 {
85     CRef<CSeq_id> seqid( new CSeq_id );
86     seqid->Assign(*genomic.GetId());
87 
88     TSeqPos loc_end = sequence::GetStop(genomic, &scope);
89     TSeqPos seq_end = sequence::GetLength(*genomic.GetId(), &scope)-1;
90 
91     CRef<CSeq_loc> extended_seqloc(new CSeq_loc);
92     if (loc_end<=seq_end)
93         extended_seqloc->Assign(genomic);
94     else {
95         CRef<CSeq_loc> extra_seqloc(new CSeq_loc(*seqid,seq_end+1,loc_end,genomic.GetStrand()) );
96         extended_seqloc = sequence::Seq_loc_Subtract(genomic,*extra_seqloc,CSeq_loc::fMerge_All|CSeq_loc::fSort,&scope);
97         if(extended_seqloc.Empty() || extended_seqloc->IsNull() || extended_seqloc->IsEmpty() ) {
98            NCBI_THROW(CProSplignException, eGenericError, "[from,to] range provided is out of sequence.");
99         }
100         extended_seqloc->SetId(*seqid); // Seq_loc_Subtract might change the id, e.g. replace accession with gi
101         genomic.Assign(*extended_seqloc);
102     }
103 
104     m_size = sequence::GetLength(*extended_seqloc,&scope);
105 
106     if (IsForward(genomic.GetStrand())) {
107         TSeqPos pos = sequence::GetStop(*extended_seqloc, &scope);
108         if (pos<seq_end) {
109             TSeqPos pos1 = pos + 3;
110             if (pos1 > seq_end)
111                 pos1 = seq_end;
112             CRef<CSeq_loc> extra_seqloc(new CSeq_loc(*seqid,pos,pos1,eNa_strand_plus) );
113             extended_seqloc = sequence::Seq_loc_Add(*extended_seqloc,*extra_seqloc,CSeq_loc::fMerge_All|CSeq_loc::fSort,&scope);
114         }
115     } else {
116         TSeqPos pos = sequence::GetStart(*extended_seqloc, &scope);
117         if (pos > 0) {
118             TSeqPos  pos0 = pos < 3 ? 0 : pos - 3;
119             CRef<CSeq_loc> extra_seqloc(new CSeq_loc(*seqid,pos0,pos-1,eNa_strand_minus) );
120             extended_seqloc = sequence::Seq_loc_Add(*extended_seqloc,*extra_seqloc,CSeq_loc::fMerge_All|CSeq_loc::fSort,&scope);
121         }
122     }
123 
124     CSeqVector seq_vec(*extended_seqloc,scope,CBioseq_Handle::eCoding_Ncbi);
125 
126     vector<char> convert(16,nN);
127     convert[1] = nA;
128     convert[2] = nC;
129     convert[4] = nG;
130     convert[8] = nT;
131 
132     seq.clear();
133     for (CSeqVector_CI i(seq_vec); i; ++i) {
134         seq.push_back(convert[*i&0xf]);
135     }
136 }
137 
138 
139 END_SCOPE(prosplign)
140 END_NCBI_SCOPE
141