1 /* $Id: glimmer_reader.cpp 618513 2020-10-21 16:07:12Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <objtools/readers/glimmer_reader.hpp>
34 #include <objtools/error_codes.hpp>
35 #include <objects/seqfeat/Seq_feat.hpp>
36 #include <objects/seqfeat/Cdregion.hpp>
37 #include <objects/seqfeat/Genetic_code.hpp>
38 #include <objects/seqfeat/Genetic_code_table.hpp>
39 #include <objects/seqloc/Seq_interval.hpp>
40 #include <objects/seq/Seq_data.hpp>
41 #include <objmgr/util/sequence.hpp>
42
43 #include <corelib/ncbiutil.hpp>
44
45
46 #define NCBI_USE_ERRCODE_X Objtools_Rd_Glimmer
47
48 BEGIN_NCBI_SCOPE
49 USING_SCOPE(objects);
50
51
CGlimmerReader()52 CGlimmerReader::CGlimmerReader()
53 {
54 }
55
56
Read(CNcbiIstream & istr,CScope & scope,int genetic_code_idx)57 CRef<CSeq_entry> CGlimmerReader::Read(CNcbiIstream& istr, CScope& scope,
58 int genetic_code_idx)
59 {
60 CRef<CSeq_entry> entry(new CSeq_entry);
61 CRef<CSeq_annot> annot(new CSeq_annot);
62 entry->SetSet().SetSeq_set();
63 entry->SetSet().SetAnnot().push_back(annot);
64
65 /// parse line by line
66 /// we will be lax and skip enpty lines; we will also permit a few comments
67 string line;
68 string defline;
69 CSeq_id_Handle idh;
70 TSeqPos seq_length = 0;
71
72 size_t errs = 0;
73 size_t count = 0;
74 while (NcbiGetlineEOL(istr, line)) {
75 ++count;
76 if (line.empty() || line[0] == '#' ||
77 (line.size() >= 2 && line[0] == '/' && line[1] == '/')) {
78 continue;
79 }
80
81 if (defline.empty()) {
82 if (line[0] == '>') {
83 defline = line;
84 string s = defline;
85 string::size_type pos = s.find_first_of(" ");
86 if (pos != string::npos) {
87 s.erase(pos);
88 }
89 s.erase(0, 1);
90
91 CBioseq::TId ids;
92 CSeq_id::ParseFastaIds(ids, s);
93
94 CRef<CSeq_id> best = FindBestChoice(ids, CSeq_id::Score);
95 idh = CSeq_id_Handle::GetHandle(*best);
96
97 CBioseq_Handle bsh = scope.GetBioseqHandle(idh);
98 if ( !bsh ) {
99 NCBI_THROW(CException, eUnknown,
100 "Failed to find sequence: " + s);
101 }
102 seq_length = bsh.GetBioseqLength();
103 } else {
104 CNcbiOstrstream ostr;
105 ostr << "CGlimmerReader::ReadAnnot(): line "
106 << count << ": failed to identify defline: " << line;
107 string msg = string(CNcbiOstrstreamToString(ostr));
108 ERR_POST_X(1, Error << msg);
109 NCBI_THROW(CException, eUnknown, msg);
110 }
111 } else {
112 list<string> toks;
113 NStr::Split(line, " \t", toks, NStr::fSplit_Tokenize);
114 if (toks.size() != 5) {
115 CNcbiOstrstream ostr;
116 ostr << "CGlimmerReader::ReadAnnot(): line "
117 << count << ": invalid number of tokens: "
118 << "found " << toks.size() << ", expected 5: " << line;
119 string msg = string(CNcbiOstrstreamToString(ostr));
120 ERR_POST_X(2, Error << msg);
121 ++errs;
122 if (errs > 5) {
123 NCBI_THROW(CException, eUnknown, msg);
124 }
125 }
126
127 list<string>::iterator it = toks.begin();
128
129 /// token 1: ORF identifier
130 string orf_name = *it++;
131
132 /// token 2: start position
133 TSeqPos start_pos = 0;
134 try {
135 start_pos = NStr::StringToInt(*it++);
136 start_pos -= 1;
137 }
138 catch (CException&) {
139 CNcbiOstrstream ostr;
140 ostr << "CGlimmerReader::ReadAnnot(): line "
141 << count << ": failed to identify start pos: " << line;
142 string msg = string(CNcbiOstrstreamToString(ostr));
143 ERR_POST_X(3, Error << msg);
144
145 ++errs;
146 if (errs > 5) {
147 NCBI_THROW(CException, eUnknown, msg);
148 } else {
149 continue;
150 }
151 }
152
153 /// token 3: stop position
154 TSeqPos stop_pos = 0;
155 try {
156 stop_pos = NStr::StringToInt(*it++);
157 stop_pos -= 1;
158 }
159 catch (CException&) {
160 CNcbiOstrstream ostr;
161 ostr << "CGlimmerReader::ReadAnnot(): line "
162 << count << ": failed to identify stop pos: " << line;
163 string msg = string(CNcbiOstrstreamToString(ostr));
164 ERR_POST_X(4, Error << msg);
165
166 ++errs;
167 if (errs > 5) {
168 NCBI_THROW(CException, eUnknown, msg);
169 } else {
170 continue;
171 }
172 }
173
174 /// stop may be less than start!
175
176 /// token 4: frame + strand
177 ENa_strand strand = eNa_strand_plus;
178 try {
179 int frame = NStr::StringToInt(*it++);
180 if (frame > 3 || frame < -3) {
181 NCBI_THROW(CException, eUnknown, "frame out of range");
182 }
183
184 if (frame < 0) {
185 strand = eNa_strand_minus;
186 }
187 }
188 catch (CException&) {
189 CNcbiOstrstream ostr;
190 ostr << "CGlimmerReader::ReadAnnot(): line "
191 << count << ": failed to identify frame: " << line;
192 string msg = string(CNcbiOstrstreamToString(ostr));
193 ERR_POST_X(5, Error << msg);
194
195 ++errs;
196 if (errs > 5) {
197 NCBI_THROW(CException, eUnknown, msg);
198 } else {
199 continue;
200 }
201 }
202
203 /// token 5: score
204 //double score = 0;
205 try {
206 /*score =*/ NStr::StringToDouble(*it++);
207 }
208 catch (CException&) {
209 CNcbiOstrstream ostr;
210 ostr << "CGlimmerReader::ReadAnnot(): line "
211 << count << ": failed to identify score: " << line;
212 string msg = string(CNcbiOstrstreamToString(ostr));
213 ERR_POST_X(6, Error << msg);
214
215 ++errs;
216 if (errs > 5) {
217 NCBI_THROW(CException, eUnknown, msg);
218 } else {
219 continue;
220 }
221 }
222
223 ///
224 /// build our features
225 ///
226
227 /// CDS feat
228 CRef<CSeq_feat> cds_feat(new CSeq_feat());
229 if (strand == eNa_strand_plus && start_pos > stop_pos) {
230 /// circular cds_feature; make two intervals
231 CRef<CSeq_interval> ival;
232
233 ival.Reset(new CSeq_interval);
234 ival->SetFrom(start_pos);
235 ival->SetTo (seq_length - 1);
236 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
237
238 ival.Reset(new CSeq_interval);
239 ival->SetFrom(0);
240 ival->SetTo (stop_pos);
241 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
242
243 } else if (strand == eNa_strand_minus && start_pos < stop_pos) {
244 /// circular cds_feature; make two intervals
245 CRef<CSeq_interval> ival;
246
247 ival.Reset(new CSeq_interval);
248 ival->SetFrom(0);
249 ival->SetTo (start_pos);
250 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
251
252 ival.Reset(new CSeq_interval);
253 ival->SetFrom(stop_pos);
254 ival->SetTo (seq_length - 1);
255 cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
256
257 } else {
258 cds_feat->SetLocation().SetInt().SetFrom(min(start_pos, stop_pos));
259 cds_feat->SetLocation().SetInt().SetTo (max(start_pos, stop_pos));
260 }
261 cds_feat->SetLocation().SetStrand(strand);
262 cds_feat->SetLocation().SetId(*idh.GetSeqId());
263
264 CCdregion& cdr = cds_feat->SetData().SetCdregion();
265 if (genetic_code_idx) {
266 CRef<CGenetic_code::C_E> d(new CGenetic_code::C_E);
267 d->SetId(genetic_code_idx);
268 cdr.SetCode().Set().push_back(d);
269 }
270
271 CRef<CSeq_feat> gene_feat(new CSeq_feat);
272 gene_feat->SetData().SetGene().SetLocus(orf_name);
273 gene_feat->SetLocation().Assign(cds_feat->GetLocation());
274
275 annot->SetData().SetFtable().push_back(gene_feat);
276 annot->SetData().SetFtable().push_back(cds_feat);
277 }
278 }
279 LOG_POST_X(7, Info << "CGlimmerReader::Read(): parsed " << count << " lines, " << errs << " errors");
280
281 string prefix("lcl|prot");
282 count = 0;
283 NON_CONST_ITERATE (CSeq_annot::TData::TFtable, it, annot->SetData().SetFtable()) {
284 CSeq_feat& feat = **it;
285 if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion) {
286 continue;
287 }
288
289 CRef<CSeq_entry> sub_entry(new CSeq_entry);
290 CBioseq& bioseq = sub_entry->SetSeq();
291
292 /// establish our inst
293 CSeq_inst& inst = bioseq.SetInst();
294 CSeqTranslator::Translate(**it, scope,
295 inst.SetSeq_data().SetIupacaa().Set(),
296 false /* trim trailing stop */);
297 inst.SetRepr(CSeq_inst::eRepr_raw);
298 inst.SetMol(CSeq_inst::eMol_aa);
299 inst.SetLength(inst.SetSeq_data().SetIupacaa().Set().size());
300
301 /// create a readable seq-id
302 CNcbiOstrstream ostr;
303 ostr << prefix << setw(7) << setfill('0') << ++count;
304 string id_str = string(CNcbiOstrstreamToString(ostr));
305
306 CRef<CSeq_id> id(new CSeq_id(id_str));
307 bioseq.SetId().push_back(id);
308
309 /// set the product on the feature
310 feat.SetProduct().SetWhole().Assign(*id);
311
312 /// save our bioseq
313 /// this is done last to preserve our data in a serializable form
314 entry->SetSet().SetSeq_set().push_back(sub_entry);
315 }
316
317 return entry;
318 }
319
320
321 END_NCBI_SCOPE
322