1 /*
2 * $Id: NSeq.cpp 494932 2016-03-11 18:54:17Z kiryutin $
3 *
4 * =========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannt warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * =========================================================================
27 *
28 * Author: Boris Kiryutin
29 *
30 * =========================================================================
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35
36 #include <algo/align/prosplign/prosplign_exception.hpp>
37
38 #include "NSeq.hpp"
39 #include "nucprot.hpp"
40
41 #include <objmgr/util/seq_loc_util.hpp>
42 #include <objmgr/seq_vector.hpp>
43
44 BEGIN_NCBI_SCOPE
45 BEGIN_SCOPE(prosplign)
46 USING_SCOPE(ncbi::objects);
47
CNSeq(void)48 CNSeq::CNSeq(void)
49 {
50 m_size = 0;
51 }
52
~CNSeq(void)53 CNSeq::~CNSeq(void)
54 {
55 }
56
57 // letter by position
Upper(int pos) const58 char CNSeq::Upper(int pos) const
59 {
60 return CTranslationTable::NucToChar(seq[pos]);
61 }
62
63
Init(const CNSeq & fullseq,const vector<pair<int,int>> & igi)64 void CNSeq::Init(const CNSeq& fullseq, const vector<pair<int, int> >& igi)
65 {
66 seq.clear();
67 NSEQ::const_iterator beg, end;
68 beg = fullseq.seq.begin();
69 m_size = fullseq.m_size;
70 _ASSERT((int)fullseq.seq.size() >= fullseq.size());
71 for(vector<pair<int, int> >::const_iterator it = igi.begin(); it != igi.end(); ++it) {
72 if(it->first < 0 || it->second < 1) NCBI_THROW(CProSplignException, eGenericError, "Intron coordinates are invalid");
73 if(it->first + it->second > fullseq.size()) NCBI_THROW(CProSplignException, eGenericError, "Intron coordinate is out of sequence");
74 end = fullseq.seq.begin() + it->first;
75 if(end < beg) NCBI_THROW(CProSplignException, eGenericError, "Intron coordinates have wrong order");
76 seq.insert(seq.end(), beg, end);
77 beg = end + it->second;
78 m_size -= it->second;
79 }
80 if(beg < fullseq.seq.end()) seq.insert(seq.end(), beg, fullseq.seq.end());
81 }
82
Init(CScope & scope,CSeq_loc & genomic)83 void CNSeq::Init(CScope& scope, CSeq_loc& genomic)
84 {
85 CRef<CSeq_id> seqid( new CSeq_id );
86 seqid->Assign(*genomic.GetId());
87
88 TSeqPos loc_end = sequence::GetStop(genomic, &scope);
89 TSeqPos seq_end = sequence::GetLength(*genomic.GetId(), &scope)-1;
90
91 CRef<CSeq_loc> extended_seqloc(new CSeq_loc);
92 if (loc_end<=seq_end)
93 extended_seqloc->Assign(genomic);
94 else {
95 CRef<CSeq_loc> extra_seqloc(new CSeq_loc(*seqid,seq_end+1,loc_end,genomic.GetStrand()) );
96 extended_seqloc = sequence::Seq_loc_Subtract(genomic,*extra_seqloc,CSeq_loc::fMerge_All|CSeq_loc::fSort,&scope);
97 if(extended_seqloc.Empty() || extended_seqloc->IsNull() || extended_seqloc->IsEmpty() ) {
98 NCBI_THROW(CProSplignException, eGenericError, "[from,to] range provided is out of sequence.");
99 }
100 extended_seqloc->SetId(*seqid); // Seq_loc_Subtract might change the id, e.g. replace accession with gi
101 genomic.Assign(*extended_seqloc);
102 }
103
104 m_size = sequence::GetLength(*extended_seqloc,&scope);
105
106 if (IsForward(genomic.GetStrand())) {
107 TSeqPos pos = sequence::GetStop(*extended_seqloc, &scope);
108 if (pos<seq_end) {
109 TSeqPos pos1 = pos + 3;
110 if (pos1 > seq_end)
111 pos1 = seq_end;
112 CRef<CSeq_loc> extra_seqloc(new CSeq_loc(*seqid,pos,pos1,eNa_strand_plus) );
113 extended_seqloc = sequence::Seq_loc_Add(*extended_seqloc,*extra_seqloc,CSeq_loc::fMerge_All|CSeq_loc::fSort,&scope);
114 }
115 } else {
116 TSeqPos pos = sequence::GetStart(*extended_seqloc, &scope);
117 if (pos > 0) {
118 TSeqPos pos0 = pos < 3 ? 0 : pos - 3;
119 CRef<CSeq_loc> extra_seqloc(new CSeq_loc(*seqid,pos0,pos-1,eNa_strand_minus) );
120 extended_seqloc = sequence::Seq_loc_Add(*extended_seqloc,*extra_seqloc,CSeq_loc::fMerge_All|CSeq_loc::fSort,&scope);
121 }
122 }
123
124 CSeqVector seq_vec(*extended_seqloc,scope,CBioseq_Handle::eCoding_Ncbi);
125
126 vector<char> convert(16,nN);
127 convert[1] = nA;
128 convert[2] = nC;
129 convert[4] = nG;
130 convert[8] = nT;
131
132 seq.clear();
133 for (CSeqVector_CI i(seq_vec); i; ++i) {
134 seq.push_back(convert[*i&0xf]);
135 }
136 }
137
138
139 END_SCOPE(prosplign)
140 END_NCBI_SCOPE
141