1 /*  $Id: ucscregion_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Sergiy Gotvyanskyy
27  *
28  * File Description:
29  *   Distance matrix readers (UCSC-style Region format)
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbithr.hpp>
37 #include <corelib/ncbiutil.hpp>
38 #include <corelib/ncbiexpt.hpp>
39 #include <corelib/stream_utils.hpp>
40 
41 #include <util/static_map.hpp>
42 #include <util/line_reader.hpp>
43 
44 #include <serial/iterator.hpp>
45 #include <serial/objistrasn.hpp>
46 
47 // Objects includes
48 #include <objects/general/Int_fuzz.hpp>
49 #include <objects/general/Object_id.hpp>
50 #include <objects/general/User_object.hpp>
51 #include <objects/general/User_field.hpp>
52 #include <objects/general/Dbtag.hpp>
53 
54 #include <objects/seqloc/Seq_id.hpp>
55 #include <objects/seqloc/Seq_loc.hpp>
56 #include <objects/seqloc/Seq_interval.hpp>
57 #include <objects/seqloc/Seq_point.hpp>
58 #include <objects/seqfeat/SeqFeatXref.hpp>
59 
60 #include <objects/seqset/Seq_entry.hpp>
61 #include <objects/seq/Seq_annot.hpp>
62 #include <objects/seq/Annotdesc.hpp>
63 #include <objects/seq/Seqdesc.hpp>
64 #include <objects/seq/Annot_descr.hpp>
65 #include <objects/seqfeat/SeqFeatData.hpp>
66 
67 #include <objects/seqfeat/Seq_feat.hpp>
68 #include <objects/seqfeat/BioSource.hpp>
69 #include <objects/seqfeat/Org_ref.hpp>
70 #include <objects/seqfeat/OrgName.hpp>
71 #include <objects/seqfeat/SubSource.hpp>
72 #include <objects/seqfeat/OrgMod.hpp>
73 #include <objects/seqfeat/Gene_ref.hpp>
74 #include <objects/seqfeat/Cdregion.hpp>
75 #include <objects/seqfeat/Code_break.hpp>
76 #include <objects/seqfeat/Genetic_code.hpp>
77 #include <objects/seqfeat/Genetic_code_table.hpp>
78 #include <objects/seqfeat/RNA_ref.hpp>
79 #include <objects/seqfeat/Trna_ext.hpp>
80 #include <objects/seqfeat/Imp_feat.hpp>
81 #include <objects/seqfeat/Gb_qual.hpp>
82 #include <objects/seqfeat/Feat_id.hpp>
83 
84 #include <objtools/readers/read_util.hpp>
85 #include <objtools/readers/reader_exception.hpp>
86 #include <objtools/readers/line_error.hpp>
87 #include <objtools/readers/message_listener.hpp>
88 #include <objtools/readers/ucscregion_reader.hpp>
89 #include <objtools/error_codes.hpp>
90 
91 #include <algorithm>
92 
93 
94 #define NCBI_USE_ERRCODE_X   Objtools_Rd_RepMask
95 
96 BEGIN_NCBI_SCOPE
97 BEGIN_objects_SCOPE // namespace ncbi::objects::
98 
CUCSCRegionReader(unsigned int iflags)99 CUCSCRegionReader::CUCSCRegionReader(unsigned int iflags):
100   CReaderBase(iflags)
101 {
102 }
103 
~CUCSCRegionReader()104 CUCSCRegionReader::~CUCSCRegionReader()
105 {
106 }
107 
xParseFeatureUCSCFormat(const string & Line,int Number)108 CRef<objects::CSeq_feat> CUCSCRegionReader::xParseFeatureUCSCFormat(const string& Line, int Number)
109 {
110     CRef<CSeq_feat> Feat(new CSeq_feat);
111 #if 0
112     vector<string> Tokens;
113     string Delims = ".:- \t";
114     {{
115         size_t IdSubI = 0;
116         for(int I = 0; I < Line.length(); I++) {
117             CSeq_id::EAccessionInfo Info = CSeq_id::IdentifyAccession(Line.substr(0, I));
118             if(Info > 0) {
119                 IdSubI = I;
120             }
121         }
122 
123         if(IdSubI > 0) {
124             string IdStr = Line.substr(0, IdSubI);
125             string SubLine = Line.substr(IdSubI );
126             //cerr << IdStr << endl;
127             //cerr << SubLine << endl;
128             NStr::Split(NStr::TruncateSpaces(SubLine), Delims, Tokens, NStr::fSplit_Tokenize);
129             Tokens.insert(Tokens.begin(), IdStr);
130         } else {
131             NStr::Split(NStr::TruncateSpaces(Line), Delims, Tokens, NStr::fSplit_Tokenize);
132         }
133     }}
134 
135     //ITERATE(vector<string>, iter, Tokens) {
136     //  cerr << "\t" << *iter << endl;
137     //}
138 
139     if(Tokens.size() < K_START+1)
140         return CRef<CSeq_feat>();
141 
142     NStr::ReplaceInPlace(Tokens[K_START], ",", "");
143     NStr::ReplaceInPlace(Tokens[K_START], ".", "");
144     if(Tokens.size() >= K_STOP+1) {
145         NStr::ReplaceInPlace(Tokens[K_STOP], ",", "");
146         NStr::ReplaceInPlace(Tokens[K_STOP], ".", "");
147     }
148 
149     Feat->SetData().SetRegion("region: "+NStr::IntToString(Number));
150 
151     CRef<CSeq_loc> TopLoc(new CSeq_loc);
152     TopLoc->SetInt().SetId().Assign(*CRegionFile::x_ParseId(Tokens[K_ID]));
153     TopLoc->SetInt().SetFrom() = NStr::StringToUInt8(Tokens[K_START])-1;
154     if(Tokens.size() >= K_STOP+1)
155         TopLoc->SetInt().SetTo() = NStr::StringToUInt8(Tokens[K_STOP])-1;
156     else
157         TopLoc->SetInt().SetTo() = TopLoc->GetInt().GetFrom();
158 
159     if(Tokens.size() >= K_STRAND+1) {
160         if(Tokens[K_STRAND] == "+")
161             TopLoc->SetInt().SetStrand() = eNa_strand_plus;
162         else if(Tokens[K_STRAND] == "-")
163             TopLoc->SetInt().SetStrand() = eNa_strand_minus;
164         else
165             TopLoc->SetInt().SetStrand() = eNa_strand_plus;
166     } else {
167         TopLoc->SetInt().SetStrand() = eNa_strand_plus;
168     }
169     Feat->SetLocation().Assign(*TopLoc);
170 
171     if(!Feat->CanGetTitle())
172         Feat->SetTitle() = "Line:"+NStr::IntToString(Number);
173 
174 //cerr << MSerial_AsnText << *Feat;
175 #endif
176 
177     return Feat;
178 }
179 //  ----------------------------------------------------------------------------
xSmartFieldSplit(vector<string> & fields,CTempString line)180 void CUCSCRegionReader::xSmartFieldSplit(vector<string>& fields, CTempString line)
181 {
182     NStr::Split(line, " \t.-:", fields, NStr::fSplit_Tokenize);
183     if (line[line.length()-1] == '-')
184         fields.push_back("-");
185     while (fields.size() > 3)
186     {
187         if (fields.size() == 4 && (fields.back() == "+" || fields.back() == "-"))
188             break;
189         // try to merge first column
190         size_t len = fields[0].length();
191         if (line[len] == '.')
192         {
193             fields[0] += line[len];
194             fields[0] += fields[1];
195             fields.erase(fields.begin()+1);
196         } else {
197             break;
198         }
199     }
200 }
201 //  ----------------------------------------------------------------------------
xParseFeature(const vector<string> & fields,CSeq_annot & annot,ILineErrorListener * pEC)202 bool CUCSCRegionReader::xParseFeature(
203     const vector<string>& fields,
204     CSeq_annot& annot,
205     ILineErrorListener* pEC)
206 {
207     //  assign
208     string str_line_number = NStr::IntToString(m_uLineNumber);
209     CSeq_annot::C_Data::TFtable& ftable = annot.SetData().SetFtable();
210     CRef<CSeq_feat> feature;
211     feature.Reset( new CSeq_feat );
212     try {
213         x_SetFeatureLocation(feature, fields);
214         feature->SetData().SetRegion() = "region: "+ str_line_number;
215         if(!feature->CanGetTitle())
216             feature->SetTitle() = "Line:" + str_line_number;
217     }
218     catch(CObjReaderLineException& err) {
219         ProcessError(err, pEC);
220         return false;
221     }
222     ftable.push_back( feature );
223     return true;
224 }
225 //  ----------------------------------------------------------------------------
x_SetFeatureLocation(CRef<CSeq_feat> & feature,const vector<string> & fields)226 void CUCSCRegionReader::x_SetFeatureLocation(
227     CRef<CSeq_feat>& feature,
228     const vector<string>& fields )
229 //  ----------------------------------------------------------------------------
230 {
231     //
232     //  Note:
233     //  BED convention for specifying intervals is 0-based, first in, first out.
234     //  ASN convention for specifying intervals is 0-based, first in, last in.
235     //  Hence, conversion BED->ASN  leaves the first leaves the "from" coordinate
236     //  unchanged, and decrements the "to" coordinate by one.
237     //
238 
239     CRef<CSeq_loc> location(new CSeq_loc);
240     int from, to;
241     from = to = -1;
242 
243     //already established: We got at least three columns
244     try {
245         from = NStr::StringToInt(fields[1], NStr::fAllowCommas)-1;
246     }
247     catch(std::exception&) {
248         AutoPtr<CObjReaderLineException> pErr(
249             CObjReaderLineException::Create(
250             eDiag_Error,
251             m_uLineNumber,
252             "Invalid data line: Bad \"SeqStart\" value." ) );
253         pErr->Throw();
254     }
255     to = from;
256 
257     if (fields.size()>2)
258     try {
259         to = NStr::StringToInt(fields[2], NStr::fAllowCommas) - 1;
260     }
261     catch(std::exception&) {
262         AutoPtr<CObjReaderLineException> pErr(
263             CObjReaderLineException::Create(
264             eDiag_Error,
265             m_uLineNumber,
266             "Invalid data line: Bad \"SeqStop\" value.") );
267         pErr->Throw();
268     }
269 
270     if (from == to) {
271         location->SetPnt().SetPoint(from);
272     }
273     else if (from < to) {
274         location->SetInt().SetFrom(from);
275         location->SetInt().SetTo(to);
276     }
277     else {
278         AutoPtr<CObjReaderLineException> pErr(
279             CObjReaderLineException::Create(
280             eDiag_Error,
281             m_uLineNumber,
282             "Invalid data line: \"SeqStop\" less than \"SeqStart\"." ) );
283         pErr->Throw();
284     }
285 
286     size_t strand_field = 3;
287     if (strand_field < fields.size()) {
288         string strand = fields[strand_field];
289         if (strand != "+"  &&  strand != "-"  &&  strand != ".") {
290             AutoPtr<CObjReaderLineException> pErr(
291                 CObjReaderLineException::Create(
292                 eDiag_Error,
293                 m_uLineNumber,
294                 "Invalid data line: Invalid strand character." ) );
295             pErr->Throw();
296         }
297         location->SetStrand(( fields[strand_field] == "+" ) ?
298                            eNa_strand_plus : eNa_strand_minus );
299     }
300     try
301     {
302         CRef<CSeq_id> id = CReadUtil::AsSeqId(fields[0], m_iFlags, false);
303         //CRef<CSeq_id> id (new CSeq_id(fields[0], CSeq_id::fParse_AnyRaw | m_iFlags));
304         location->SetId(*id);
305         feature->SetLocation(*location);
306     }
307     catch(CSeqIdException&)
308     {
309         AutoPtr<CObjReaderLineException> pErr(
310             CObjReaderLineException::Create(
311             eDiag_Error,
312             m_uLineNumber,
313             "Malformed sequence id:" ) );
314         pErr->Throw();
315     }
316 }
317 
318 
319 //  ----------------------------------------------------------------------------
ReadObject(ILineReader & lr,ILineErrorListener * pErrors)320 CRef<CSerialObject> CUCSCRegionReader::ReadObject(ILineReader& lr, ILineErrorListener* pErrors)
321 {
322     CRef<CSeq_annot> annot = ReadSeqAnnot(lr, pErrors);
323     return CRef<CSerialObject>(annot);
324 }
325 
326 
327 //  ----------------------------------------------------------------------------
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)328 CRef<CSeq_annot> CUCSCRegionReader::ReadSeqAnnot(
329     ILineReader& lr,
330     ILineErrorListener* pEC )
331 {
332     const size_t MAX_RECORDS = 100000;
333 
334     CRef<CSeq_annot> pAnnot;
335     CRef<CAnnot_descr> desc;
336 
337     pAnnot.Reset(new CSeq_annot);
338     desc.Reset(new CAnnot_descr);
339     pAnnot->SetDesc(*desc);
340     CSeq_annot::C_Data::TFtable& tbl = pAnnot->SetData().SetFtable();
341 
342     int featureCount = 0;
343 
344     while (!lr.AtEOF()) {
345 
346         ++m_uLineNumber;
347 
348         CTempString line = *++lr;
349 
350         if (NStr::TruncateSpaces_Unsafe(line).empty()) {
351             continue;
352         }
353         if (xParseComment(line, pAnnot)) {
354             continue;
355         }
356         CTempString record_copy = NStr::TruncateSpaces_Unsafe(line);
357 
358         //  parse
359         vector<string> fields;
360         xSmartFieldSplit(fields, record_copy);
361         if (xParseFeature(fields, *pAnnot, pEC)) {
362             ++featureCount;
363             continue;
364         }
365         if (tbl.size() >= MAX_RECORDS) {
366             break;
367         }
368     }
369     //  Only return a valid object if there was at least one feature
370     if (0 == featureCount) {
371         return CRef<CSeq_annot>();
372     }
373     return pAnnot;
374 }
375 
376 
377 END_objects_SCOPE
378 END_NCBI_SCOPE
379