1 /*  $Id: glimmer_reader.cpp 618513 2020-10-21 16:07:12Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <objtools/readers/glimmer_reader.hpp>
34 #include <objtools/error_codes.hpp>
35 #include <objects/seqfeat/Seq_feat.hpp>
36 #include <objects/seqfeat/Cdregion.hpp>
37 #include <objects/seqfeat/Genetic_code.hpp>
38 #include <objects/seqfeat/Genetic_code_table.hpp>
39 #include <objects/seqloc/Seq_interval.hpp>
40 #include <objects/seq/Seq_data.hpp>
41 #include <objmgr/util/sequence.hpp>
42 
43 #include <corelib/ncbiutil.hpp>
44 
45 
46 #define NCBI_USE_ERRCODE_X   Objtools_Rd_Glimmer
47 
48 BEGIN_NCBI_SCOPE
49 USING_SCOPE(objects);
50 
51 
CGlimmerReader()52 CGlimmerReader::CGlimmerReader()
53 {
54 }
55 
56 
Read(CNcbiIstream & istr,CScope & scope,int genetic_code_idx)57 CRef<CSeq_entry> CGlimmerReader::Read(CNcbiIstream& istr, CScope& scope,
58                                       int genetic_code_idx)
59 {
60     CRef<CSeq_entry> entry(new CSeq_entry);
61     CRef<CSeq_annot> annot(new CSeq_annot);
62     entry->SetSet().SetSeq_set();
63     entry->SetSet().SetAnnot().push_back(annot);
64 
65     /// parse line by line
66     /// we will be lax and skip enpty lines; we will also permit a few comments
67     string line;
68     string defline;
69     CSeq_id_Handle idh;
70     TSeqPos seq_length = 0;
71 
72     size_t errs = 0;
73     size_t count = 0;
74     while (NcbiGetlineEOL(istr, line)) {
75         ++count;
76         if (line.empty()  ||  line[0] == '#'  ||
77             (line.size() >= 2  &&  line[0] == '/'  &&  line[1] == '/')) {
78             continue;
79         }
80 
81         if (defline.empty()) {
82             if (line[0] == '>') {
83                 defline = line;
84                 string s = defline;
85                 string::size_type pos = s.find_first_of(" ");
86                 if (pos != string::npos) {
87                     s.erase(pos);
88                 }
89                 s.erase(0, 1);
90 
91                 CBioseq::TId ids;
92                 CSeq_id::ParseFastaIds(ids, s);
93 
94                 CRef<CSeq_id> best = FindBestChoice(ids, CSeq_id::Score);
95                 idh = CSeq_id_Handle::GetHandle(*best);
96 
97                 CBioseq_Handle bsh = scope.GetBioseqHandle(idh);
98                 if ( !bsh ) {
99                     NCBI_THROW(CException, eUnknown,
100                                "Failed to find sequence: " + s);
101                 }
102                 seq_length = bsh.GetBioseqLength();
103             } else {
104                 CNcbiOstrstream ostr;
105                 ostr << "CGlimmerReader::ReadAnnot(): line "
106                     << count << ": failed to identify defline: " << line;
107                 string msg = string(CNcbiOstrstreamToString(ostr));
108                 ERR_POST_X(1, Error << msg);
109                 NCBI_THROW(CException, eUnknown, msg);
110             }
111         } else {
112             list<string> toks;
113             NStr::Split(line, " \t", toks, NStr::fSplit_Tokenize);
114             if (toks.size() != 5) {
115                 CNcbiOstrstream ostr;
116                 ostr << "CGlimmerReader::ReadAnnot(): line "
117                     << count << ": invalid number of tokens: "
118                     << "found " << toks.size() << ", expected 5: " << line;
119                 string msg = string(CNcbiOstrstreamToString(ostr));
120                 ERR_POST_X(2, Error << msg);
121                 ++errs;
122                 if (errs > 5) {
123                     NCBI_THROW(CException, eUnknown, msg);
124                 }
125             }
126 
127             list<string>::iterator it = toks.begin();
128 
129             /// token 1: ORF identifier
130             string orf_name = *it++;
131 
132             /// token 2: start position
133             TSeqPos start_pos = 0;
134             try {
135                 start_pos = NStr::StringToInt(*it++);
136                 start_pos -= 1;
137             }
138             catch (CException&) {
139                 CNcbiOstrstream ostr;
140                 ostr << "CGlimmerReader::ReadAnnot(): line "
141                     << count << ": failed to identify start pos: " << line;
142                 string msg = string(CNcbiOstrstreamToString(ostr));
143                 ERR_POST_X(3, Error << msg);
144 
145                 ++errs;
146                 if (errs > 5) {
147                     NCBI_THROW(CException, eUnknown, msg);
148                 } else {
149                     continue;
150                 }
151             }
152 
153             /// token 3: stop position
154             TSeqPos stop_pos = 0;
155             try {
156                 stop_pos = NStr::StringToInt(*it++);
157                 stop_pos -= 1;
158             }
159             catch (CException&) {
160                 CNcbiOstrstream ostr;
161                 ostr << "CGlimmerReader::ReadAnnot(): line "
162                     << count << ": failed to identify stop pos: " << line;
163                 string msg = string(CNcbiOstrstreamToString(ostr));
164                 ERR_POST_X(4, Error << msg);
165 
166                 ++errs;
167                 if (errs > 5) {
168                     NCBI_THROW(CException, eUnknown, msg);
169                 } else {
170                     continue;
171                 }
172             }
173 
174             /// stop may be less than start!
175 
176             /// token 4: frame + strand
177             ENa_strand strand = eNa_strand_plus;
178             try {
179                 int frame = NStr::StringToInt(*it++);
180                 if (frame > 3  ||  frame < -3) {
181                     NCBI_THROW(CException, eUnknown, "frame out of range");
182                 }
183 
184                 if (frame < 0) {
185                     strand = eNa_strand_minus;
186                 }
187             }
188             catch (CException&) {
189                 CNcbiOstrstream ostr;
190                 ostr << "CGlimmerReader::ReadAnnot(): line "
191                     << count << ": failed to identify frame: " << line;
192                 string msg = string(CNcbiOstrstreamToString(ostr));
193                 ERR_POST_X(5, Error << msg);
194 
195                 ++errs;
196                 if (errs > 5) {
197                     NCBI_THROW(CException, eUnknown, msg);
198                 } else {
199                     continue;
200                 }
201             }
202 
203             /// token 5: score
204             //double score = 0;
205             try {
206                 /*score =*/ NStr::StringToDouble(*it++);
207             }
208             catch (CException&) {
209                 CNcbiOstrstream ostr;
210                 ostr << "CGlimmerReader::ReadAnnot(): line "
211                     << count << ": failed to identify score: " << line;
212                 string msg = string(CNcbiOstrstreamToString(ostr));
213                 ERR_POST_X(6, Error << msg);
214 
215                 ++errs;
216                 if (errs > 5) {
217                     NCBI_THROW(CException, eUnknown, msg);
218                 } else {
219                     continue;
220                 }
221             }
222 
223             ///
224             /// build our features
225             ///
226 
227             /// CDS feat
228             CRef<CSeq_feat> cds_feat(new CSeq_feat());
229             if (strand == eNa_strand_plus  &&  start_pos > stop_pos) {
230                 /// circular cds_feature; make two intervals
231                 CRef<CSeq_interval> ival;
232 
233                 ival.Reset(new CSeq_interval);
234                 ival->SetFrom(start_pos);
235                 ival->SetTo  (seq_length - 1);
236                 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
237 
238                 ival.Reset(new CSeq_interval);
239                 ival->SetFrom(0);
240                 ival->SetTo  (stop_pos);
241                 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
242 
243             } else if (strand == eNa_strand_minus  &&  start_pos < stop_pos) {
244                 /// circular cds_feature; make two intervals
245                 CRef<CSeq_interval> ival;
246 
247                 ival.Reset(new CSeq_interval);
248                 ival->SetFrom(0);
249                 ival->SetTo  (start_pos);
250                 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
251 
252                 ival.Reset(new CSeq_interval);
253                 ival->SetFrom(stop_pos);
254                 ival->SetTo  (seq_length - 1);
255                 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
256 
257             } else {
258                 cds_feat->SetLocation().SetInt().SetFrom(min(start_pos, stop_pos));
259                 cds_feat->SetLocation().SetInt().SetTo  (max(start_pos, stop_pos));
260             }
261             cds_feat->SetLocation().SetStrand(strand);
262             cds_feat->SetLocation().SetId(*idh.GetSeqId());
263 
264             CCdregion& cdr = cds_feat->SetData().SetCdregion();
265             if (genetic_code_idx) {
266                 CRef<CGenetic_code::C_E> d(new CGenetic_code::C_E);
267                 d->SetId(genetic_code_idx);
268                 cdr.SetCode().Set().push_back(d);
269             }
270 
271             CRef<CSeq_feat> gene_feat(new CSeq_feat);
272             gene_feat->SetData().SetGene().SetLocus(orf_name);
273             gene_feat->SetLocation().Assign(cds_feat->GetLocation());
274 
275             annot->SetData().SetFtable().push_back(gene_feat);
276             annot->SetData().SetFtable().push_back(cds_feat);
277         }
278     }
279     LOG_POST_X(7, Info << "CGlimmerReader::Read(): parsed " << count << " lines, " << errs << " errors");
280 
281     string prefix("lcl|prot");
282     count = 0;
283     NON_CONST_ITERATE (CSeq_annot::TData::TFtable, it, annot->SetData().SetFtable()) {
284         CSeq_feat& feat = **it;
285         if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion) {
286             continue;
287         }
288 
289         CRef<CSeq_entry> sub_entry(new CSeq_entry);
290         CBioseq& bioseq = sub_entry->SetSeq();
291 
292         /// establish our inst
293         CSeq_inst& inst = bioseq.SetInst();
294         CSeqTranslator::Translate(**it, scope,
295                                   inst.SetSeq_data().SetIupacaa().Set(),
296                                   false /* trim trailing stop */);
297         inst.SetRepr(CSeq_inst::eRepr_raw);
298         inst.SetMol(CSeq_inst::eMol_aa);
299         inst.SetLength(inst.SetSeq_data().SetIupacaa().Set().size());
300 
301         /// create a readable seq-id
302         CNcbiOstrstream ostr;
303         ostr << prefix << setw(7) << setfill('0') << ++count;
304         string id_str = string(CNcbiOstrstreamToString(ostr));
305 
306         CRef<CSeq_id> id(new CSeq_id(id_str));
307         bioseq.SetId().push_back(id);
308 
309         /// set the product on the feature
310         feat.SetProduct().SetWhole().Assign(*id);
311 
312         /// save our bioseq
313         /// this is done last to preserve our data in a serializable form
314         entry->SetSet().SetSeq_set().push_back(sub_entry);
315     }
316 
317     return entry;
318 }
319 
320 
321 END_NCBI_SCOPE
322