1 /* $Id: ucscregion_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Sergiy Gotvyanskyy
27 *
28 * File Description:
29 * Distance matrix readers (UCSC-style Region format)
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbithr.hpp>
37 #include <corelib/ncbiutil.hpp>
38 #include <corelib/ncbiexpt.hpp>
39 #include <corelib/stream_utils.hpp>
40
41 #include <util/static_map.hpp>
42 #include <util/line_reader.hpp>
43
44 #include <serial/iterator.hpp>
45 #include <serial/objistrasn.hpp>
46
47 // Objects includes
48 #include <objects/general/Int_fuzz.hpp>
49 #include <objects/general/Object_id.hpp>
50 #include <objects/general/User_object.hpp>
51 #include <objects/general/User_field.hpp>
52 #include <objects/general/Dbtag.hpp>
53
54 #include <objects/seqloc/Seq_id.hpp>
55 #include <objects/seqloc/Seq_loc.hpp>
56 #include <objects/seqloc/Seq_interval.hpp>
57 #include <objects/seqloc/Seq_point.hpp>
58 #include <objects/seqfeat/SeqFeatXref.hpp>
59
60 #include <objects/seqset/Seq_entry.hpp>
61 #include <objects/seq/Seq_annot.hpp>
62 #include <objects/seq/Annotdesc.hpp>
63 #include <objects/seq/Seqdesc.hpp>
64 #include <objects/seq/Annot_descr.hpp>
65 #include <objects/seqfeat/SeqFeatData.hpp>
66
67 #include <objects/seqfeat/Seq_feat.hpp>
68 #include <objects/seqfeat/BioSource.hpp>
69 #include <objects/seqfeat/Org_ref.hpp>
70 #include <objects/seqfeat/OrgName.hpp>
71 #include <objects/seqfeat/SubSource.hpp>
72 #include <objects/seqfeat/OrgMod.hpp>
73 #include <objects/seqfeat/Gene_ref.hpp>
74 #include <objects/seqfeat/Cdregion.hpp>
75 #include <objects/seqfeat/Code_break.hpp>
76 #include <objects/seqfeat/Genetic_code.hpp>
77 #include <objects/seqfeat/Genetic_code_table.hpp>
78 #include <objects/seqfeat/RNA_ref.hpp>
79 #include <objects/seqfeat/Trna_ext.hpp>
80 #include <objects/seqfeat/Imp_feat.hpp>
81 #include <objects/seqfeat/Gb_qual.hpp>
82 #include <objects/seqfeat/Feat_id.hpp>
83
84 #include <objtools/readers/read_util.hpp>
85 #include <objtools/readers/reader_exception.hpp>
86 #include <objtools/readers/line_error.hpp>
87 #include <objtools/readers/message_listener.hpp>
88 #include <objtools/readers/ucscregion_reader.hpp>
89 #include <objtools/error_codes.hpp>
90
91 #include <algorithm>
92
93
94 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
95
96 BEGIN_NCBI_SCOPE
97 BEGIN_objects_SCOPE // namespace ncbi::objects::
98
CUCSCRegionReader(unsigned int iflags)99 CUCSCRegionReader::CUCSCRegionReader(unsigned int iflags):
100 CReaderBase(iflags)
101 {
102 }
103
~CUCSCRegionReader()104 CUCSCRegionReader::~CUCSCRegionReader()
105 {
106 }
107
xParseFeatureUCSCFormat(const string & Line,int Number)108 CRef<objects::CSeq_feat> CUCSCRegionReader::xParseFeatureUCSCFormat(const string& Line, int Number)
109 {
110 CRef<CSeq_feat> Feat(new CSeq_feat);
111 #if 0
112 vector<string> Tokens;
113 string Delims = ".:- \t";
114 {{
115 size_t IdSubI = 0;
116 for(int I = 0; I < Line.length(); I++) {
117 CSeq_id::EAccessionInfo Info = CSeq_id::IdentifyAccession(Line.substr(0, I));
118 if(Info > 0) {
119 IdSubI = I;
120 }
121 }
122
123 if(IdSubI > 0) {
124 string IdStr = Line.substr(0, IdSubI);
125 string SubLine = Line.substr(IdSubI );
126 //cerr << IdStr << endl;
127 //cerr << SubLine << endl;
128 NStr::Split(NStr::TruncateSpaces(SubLine), Delims, Tokens, NStr::fSplit_Tokenize);
129 Tokens.insert(Tokens.begin(), IdStr);
130 } else {
131 NStr::Split(NStr::TruncateSpaces(Line), Delims, Tokens, NStr::fSplit_Tokenize);
132 }
133 }}
134
135 //ITERATE(vector<string>, iter, Tokens) {
136 // cerr << "\t" << *iter << endl;
137 //}
138
139 if(Tokens.size() < K_START+1)
140 return CRef<CSeq_feat>();
141
142 NStr::ReplaceInPlace(Tokens[K_START], ",", "");
143 NStr::ReplaceInPlace(Tokens[K_START], ".", "");
144 if(Tokens.size() >= K_STOP+1) {
145 NStr::ReplaceInPlace(Tokens[K_STOP], ",", "");
146 NStr::ReplaceInPlace(Tokens[K_STOP], ".", "");
147 }
148
149 Feat->SetData().SetRegion("region: "+NStr::IntToString(Number));
150
151 CRef<CSeq_loc> TopLoc(new CSeq_loc);
152 TopLoc->SetInt().SetId().Assign(*CRegionFile::x_ParseId(Tokens[K_ID]));
153 TopLoc->SetInt().SetFrom() = NStr::StringToUInt8(Tokens[K_START])-1;
154 if(Tokens.size() >= K_STOP+1)
155 TopLoc->SetInt().SetTo() = NStr::StringToUInt8(Tokens[K_STOP])-1;
156 else
157 TopLoc->SetInt().SetTo() = TopLoc->GetInt().GetFrom();
158
159 if(Tokens.size() >= K_STRAND+1) {
160 if(Tokens[K_STRAND] == "+")
161 TopLoc->SetInt().SetStrand() = eNa_strand_plus;
162 else if(Tokens[K_STRAND] == "-")
163 TopLoc->SetInt().SetStrand() = eNa_strand_minus;
164 else
165 TopLoc->SetInt().SetStrand() = eNa_strand_plus;
166 } else {
167 TopLoc->SetInt().SetStrand() = eNa_strand_plus;
168 }
169 Feat->SetLocation().Assign(*TopLoc);
170
171 if(!Feat->CanGetTitle())
172 Feat->SetTitle() = "Line:"+NStr::IntToString(Number);
173
174 //cerr << MSerial_AsnText << *Feat;
175 #endif
176
177 return Feat;
178 }
179 // ----------------------------------------------------------------------------
xSmartFieldSplit(vector<string> & fields,CTempString line)180 void CUCSCRegionReader::xSmartFieldSplit(vector<string>& fields, CTempString line)
181 {
182 NStr::Split(line, " \t.-:", fields, NStr::fSplit_Tokenize);
183 if (line[line.length()-1] == '-')
184 fields.push_back("-");
185 while (fields.size() > 3)
186 {
187 if (fields.size() == 4 && (fields.back() == "+" || fields.back() == "-"))
188 break;
189 // try to merge first column
190 size_t len = fields[0].length();
191 if (line[len] == '.')
192 {
193 fields[0] += line[len];
194 fields[0] += fields[1];
195 fields.erase(fields.begin()+1);
196 } else {
197 break;
198 }
199 }
200 }
201 // ----------------------------------------------------------------------------
xParseFeature(const vector<string> & fields,CSeq_annot & annot,ILineErrorListener * pEC)202 bool CUCSCRegionReader::xParseFeature(
203 const vector<string>& fields,
204 CSeq_annot& annot,
205 ILineErrorListener* pEC)
206 {
207 // assign
208 string str_line_number = NStr::IntToString(m_uLineNumber);
209 CSeq_annot::C_Data::TFtable& ftable = annot.SetData().SetFtable();
210 CRef<CSeq_feat> feature;
211 feature.Reset( new CSeq_feat );
212 try {
213 x_SetFeatureLocation(feature, fields);
214 feature->SetData().SetRegion() = "region: "+ str_line_number;
215 if(!feature->CanGetTitle())
216 feature->SetTitle() = "Line:" + str_line_number;
217 }
218 catch(CObjReaderLineException& err) {
219 ProcessError(err, pEC);
220 return false;
221 }
222 ftable.push_back( feature );
223 return true;
224 }
225 // ----------------------------------------------------------------------------
x_SetFeatureLocation(CRef<CSeq_feat> & feature,const vector<string> & fields)226 void CUCSCRegionReader::x_SetFeatureLocation(
227 CRef<CSeq_feat>& feature,
228 const vector<string>& fields )
229 // ----------------------------------------------------------------------------
230 {
231 //
232 // Note:
233 // BED convention for specifying intervals is 0-based, first in, first out.
234 // ASN convention for specifying intervals is 0-based, first in, last in.
235 // Hence, conversion BED->ASN leaves the first leaves the "from" coordinate
236 // unchanged, and decrements the "to" coordinate by one.
237 //
238
239 CRef<CSeq_loc> location(new CSeq_loc);
240 int from, to;
241 from = to = -1;
242
243 //already established: We got at least three columns
244 try {
245 from = NStr::StringToInt(fields[1], NStr::fAllowCommas)-1;
246 }
247 catch(std::exception&) {
248 AutoPtr<CObjReaderLineException> pErr(
249 CObjReaderLineException::Create(
250 eDiag_Error,
251 m_uLineNumber,
252 "Invalid data line: Bad \"SeqStart\" value." ) );
253 pErr->Throw();
254 }
255 to = from;
256
257 if (fields.size()>2)
258 try {
259 to = NStr::StringToInt(fields[2], NStr::fAllowCommas) - 1;
260 }
261 catch(std::exception&) {
262 AutoPtr<CObjReaderLineException> pErr(
263 CObjReaderLineException::Create(
264 eDiag_Error,
265 m_uLineNumber,
266 "Invalid data line: Bad \"SeqStop\" value.") );
267 pErr->Throw();
268 }
269
270 if (from == to) {
271 location->SetPnt().SetPoint(from);
272 }
273 else if (from < to) {
274 location->SetInt().SetFrom(from);
275 location->SetInt().SetTo(to);
276 }
277 else {
278 AutoPtr<CObjReaderLineException> pErr(
279 CObjReaderLineException::Create(
280 eDiag_Error,
281 m_uLineNumber,
282 "Invalid data line: \"SeqStop\" less than \"SeqStart\"." ) );
283 pErr->Throw();
284 }
285
286 size_t strand_field = 3;
287 if (strand_field < fields.size()) {
288 string strand = fields[strand_field];
289 if (strand != "+" && strand != "-" && strand != ".") {
290 AutoPtr<CObjReaderLineException> pErr(
291 CObjReaderLineException::Create(
292 eDiag_Error,
293 m_uLineNumber,
294 "Invalid data line: Invalid strand character." ) );
295 pErr->Throw();
296 }
297 location->SetStrand(( fields[strand_field] == "+" ) ?
298 eNa_strand_plus : eNa_strand_minus );
299 }
300 try
301 {
302 CRef<CSeq_id> id = CReadUtil::AsSeqId(fields[0], m_iFlags, false);
303 //CRef<CSeq_id> id (new CSeq_id(fields[0], CSeq_id::fParse_AnyRaw | m_iFlags));
304 location->SetId(*id);
305 feature->SetLocation(*location);
306 }
307 catch(CSeqIdException&)
308 {
309 AutoPtr<CObjReaderLineException> pErr(
310 CObjReaderLineException::Create(
311 eDiag_Error,
312 m_uLineNumber,
313 "Malformed sequence id:" ) );
314 pErr->Throw();
315 }
316 }
317
318
319 // ----------------------------------------------------------------------------
ReadObject(ILineReader & lr,ILineErrorListener * pErrors)320 CRef<CSerialObject> CUCSCRegionReader::ReadObject(ILineReader& lr, ILineErrorListener* pErrors)
321 {
322 CRef<CSeq_annot> annot = ReadSeqAnnot(lr, pErrors);
323 return CRef<CSerialObject>(annot);
324 }
325
326
327 // ----------------------------------------------------------------------------
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)328 CRef<CSeq_annot> CUCSCRegionReader::ReadSeqAnnot(
329 ILineReader& lr,
330 ILineErrorListener* pEC )
331 {
332 const size_t MAX_RECORDS = 100000;
333
334 CRef<CSeq_annot> pAnnot;
335 CRef<CAnnot_descr> desc;
336
337 pAnnot.Reset(new CSeq_annot);
338 desc.Reset(new CAnnot_descr);
339 pAnnot->SetDesc(*desc);
340 CSeq_annot::C_Data::TFtable& tbl = pAnnot->SetData().SetFtable();
341
342 int featureCount = 0;
343
344 while (!lr.AtEOF()) {
345
346 ++m_uLineNumber;
347
348 CTempString line = *++lr;
349
350 if (NStr::TruncateSpaces_Unsafe(line).empty()) {
351 continue;
352 }
353 if (xParseComment(line, pAnnot)) {
354 continue;
355 }
356 CTempString record_copy = NStr::TruncateSpaces_Unsafe(line);
357
358 // parse
359 vector<string> fields;
360 xSmartFieldSplit(fields, record_copy);
361 if (xParseFeature(fields, *pAnnot, pEC)) {
362 ++featureCount;
363 continue;
364 }
365 if (tbl.size() >= MAX_RECORDS) {
366 break;
367 }
368 }
369 // Only return a valid object if there was at least one feature
370 if (0 == featureCount) {
371 return CRef<CSeq_annot>();
372 }
373 return pAnnot;
374 }
375
376
377 END_objects_SCOPE
378 END_NCBI_SCOPE
379