1 /*  $Id: fasta_reader_utils.cpp 634226 2021-07-06 20:04:10Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Justin Foley
27 *
28 * File Description:
29 *   Reader for FASTA-format definition lines. Based on code
30 *   originally contained in CFastaReader.
31 *
32 * ===========================================================================
33 */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbidiag.hpp>
37 #include <objtools/error_codes.hpp>
38 #include <objtools/readers/fasta.hpp>
39 #include <objects/general/Dbtag.hpp>
40 #include <objects/general/Object_id.hpp>
41 #include <objtools/readers/fasta_reader_utils.hpp>
42 #include <objtools/readers/seqid_validate.hpp>
43 
44 #define NCBI_USE_ERRCODE_X Objtools_Rd_Fasta // Will need to change this
45 
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(objects)
48 
49 size_t CFastaDeflineReader::s_MaxLocalIDLength = CSeq_id::kMaxLocalIDLength;
50 size_t CFastaDeflineReader::s_MaxGeneralTagLength = CSeq_id::kMaxGeneralTagLength;
51 size_t CFastaDeflineReader::s_MaxAccessionLength = CSeq_id::kMaxAccessionLength;
52 
53 
54 
s_PostError(ILineErrorListener * pMessageListener,const TSeqPos lineNumber,const string & idString,const string & errMessage,const CObjReaderLineException::EProblem problem,const CObjReaderParseException::EErrCode errCode)55 static void s_PostError(ILineErrorListener* pMessageListener,
56     const TSeqPos lineNumber,
57     const string& idString,
58     const string& errMessage,
59     const CObjReaderLineException::EProblem problem,
60     const CObjReaderParseException::EErrCode errCode)
61 {
62     if (pMessageListener) {
63         unique_ptr<CObjReaderLineException> pLineExpt(
64             CObjReaderLineException::Create(
65             eDiag_Error,
66             lineNumber,
67             errMessage,
68             problem,
69             idString, "", "", "",
70             errCode));
71 
72         if (pMessageListener->PutError(*pLineExpt)) {
73             return;
74         }
75     }
76 
77     throw CObjReaderParseException(DIAG_COMPILE_INFO,
78             0,
79             errCode,
80             errMessage,
81             lineNumber,
82             eDiag_Error);
83 }
84 
s_PostWarning(ILineErrorListener * pMessageListener,const TSeqPos lineNumber,const string & idString,const string & errMessage,const CObjReaderLineException::EProblem problem,const CObjReaderParseException::EErrCode errCode)85 static void s_PostWarning(ILineErrorListener* pMessageListener,
86     const TSeqPos lineNumber,
87     const string& idString,
88     const string& errMessage,
89     const CObjReaderLineException::EProblem problem,
90     const CObjReaderParseException::EErrCode errCode)
91 {
92     unique_ptr<CObjReaderLineException> pLineExpt(
93         CObjReaderLineException::Create(
94         eDiag_Warning,
95         lineNumber,
96         errMessage,
97         problem,
98         idString, "", "", "",
99         errCode));
100 
101     if (!pMessageListener) {
102         LOG_POST_X(1, Warning << pLineExpt->Message());
103         return;
104     }
105 
106     if (!pMessageListener->PutError(*pLineExpt)) {
107         throw CObjReaderParseException(DIAG_COMPILE_INFO,
108                 0,
109                 errCode,
110                 errMessage,
111                 lineNumber,
112                 eDiag_Warning);
113     }
114 }
115 
116 // For reasons of efficiency, this method does not use CRef<CSeq_interval> to access range
117 // information - RW-26
ParseDefline(const CTempString & defline,const SDeflineParseInfo & info,const TIgnoredProblems & ignored_errors,TIds & ids,bool & has_range,TSeqPos & range_start,TSeqPos & range_end,TSeqTitles & titles,ILineErrorListener * pMessageListener)118 void CFastaDeflineReader::ParseDefline(const CTempString& defline,
119     const SDeflineParseInfo& info,
120     const TIgnoredProblems& ignored_errors,
121     TIds& ids,
122     bool& has_range,
123     TSeqPos& range_start,
124     TSeqPos& range_end,
125     TSeqTitles& titles,
126     ILineErrorListener* pMessageListener)
127 {
128     SDeflineData data;
129     ParseDefline(defline, info, data, pMessageListener);
130     has_range   = data.has_range;
131     range_start = data.range_start;
132     range_end   = data.range_end;
133     titles  = move(data.titles);
134 }
135 
ParseDefline(const CTempString & defline,const SDeflineParseInfo & info,SDeflineData & data,ILineErrorListener * pMessageListener)136 void CFastaDeflineReader::ParseDefline(const CTempString& defline,
137         const SDeflineParseInfo& info,
138         SDeflineData& data,
139         ILineErrorListener* pMessageListener)
140 {
141     static CSeqIdCheck fn_idcheck;
142     ParseDefline(defline, info, data, pMessageListener, fn_idcheck);
143 }
144 
145 
ParseDefline(const CTempString & defline,const SDeflineParseInfo & info,SDeflineData & data,ILineErrorListener * pMessageListener,FIdCheck fn_idcheck)146 void CFastaDeflineReader::ParseDefline(const CTempString& defline,
147         const SDeflineParseInfo& info,
148         SDeflineData& data,
149         ILineErrorListener* pMessageListener,
150         FIdCheck fn_idcheck)
151 {
152     size_t range_len = 0;
153     const TFastaFlags& fFastaFlags = info.fFastaFlags;
154     const TSeqPos& lineNumber = info.lineNumber;
155     data.has_range = false;
156 
157     const size_t len = defline.length();
158     if (len <= 1 ||
159         NStr::IsBlank(defline.substr(1))) {
160         return;
161     }
162 
163     if (defline[0] != '>') {
164         NCBI_THROW2(CObjReaderParseException, eFormat,
165             "Invalid defline. First character is not '>'", 0);
166     }
167 
168     // ignore spaces between '>' and the sequence ID
169     size_t start;
170     for(start = 1 ; start < len; ++start ) {
171         if( ! isspace(defline[start]) ) {
172             break;
173         }
174     }
175 
176     size_t pos;
177     size_t title_start = NPOS;
178     if ((fFastaFlags & CFastaReader::fNoParseID)) {
179         title_start = start;
180     }
181     else
182     {
183         pos = start;
184         while (pos < len && defline[pos] > ' ') {
185             pos++;
186         }
187 
188         if ( ! (fFastaFlags & CFastaReader::fDisableParseRange) ) {
189             range_len = ParseRange(defline.substr(start, pos - start),
190                     data.range_start, data.range_end, pMessageListener);
191         }
192 
193         auto id_string = defline.substr(start, pos - start - range_len);
194         if (NStr::IsBlank(id_string)) {
195             NCBI_THROW2(CObjReaderParseException, eFormat,
196                 "Unable to locate sequence id in definition line", 0);
197         }
198 
199         title_start = pos;
200         x_ProcessIDs(id_string,
201                 info,
202                 data.ids,
203                 pMessageListener,
204                 fn_idcheck);
205 
206         data.has_range = (range_len>0);
207     }
208 
209     // trim leading whitespace from title (is this appropriate?)
210     while (title_start < len
211         &&  isspace((unsigned char)defline[title_start])) {
212         ++title_start;
213     }
214 
215     if (title_start < len) {
216         for (pos = title_start + 1;  pos < len;  ++pos) {
217             if ((unsigned char)defline[pos] < ' ') {
218             break;
219             }
220         }
221         // Parse the title elsewhere - after the molecule has been deduced
222         data.titles.push_back(
223             SLineTextAndLoc(
224                 defline.substr(title_start, pos - title_start), lineNumber));
225     }
226 }
227 
228 
ParseRange(const CTempString & s,TSeqPos & start,TSeqPos & end,ILineErrorListener * pMessageListener)229 TSeqPos CFastaDeflineReader::ParseRange(
230     const CTempString& s,
231     TSeqPos& start,
232     TSeqPos& end,
233     ILineErrorListener * pMessageListener)
234 {
235 
236     if (s.empty()) {
237         return 0;
238     }
239 
240     bool    on_start = false;
241     bool    negative = false;
242     TSeqPos mult = 1;
243     size_t  pos;
244     start = end = 0;
245     for (pos = s.length() - 1;  pos > 0;  --pos) {
246         unsigned char c = s[pos];
247         if (c >= '0'  &&  c <= '9') {
248             if (on_start) {
249                 start += (c - '0') * mult;
250             } else {
251                 end += (c - '0') * mult;
252             }
253             mult *= 10;
254         } else if (c == '-'  &&  !on_start  &&  mult > 1) {
255             on_start = true;
256             mult = 1;
257         } else if (c == ':'  &&  on_start  &&  mult > 1) {
258             break;
259         } else if (c == 'c'  &&  pos > 0  &&  s[--pos] == ':'
260                    &&  on_start  &&  mult > 1) {
261             negative = true;
262             break;
263         } else {
264             return 0; // syntax error
265         }
266     }
267     if ((negative ? (end > start) : (start > end))  ||  s[pos] != ':') {
268         return 0;
269     }
270     --start;
271     --end;
272     return TSeqPos(s.length() - pos);
273 }
274 
275 
276 class CIdErrorReporter
277 {
278 public:
279     CIdErrorReporter(ILineErrorListener* pMessageListener, bool ignoreGeneralParsingError=false);
280 
281     void operator() (EDiagSev severity,
282             int lineNum,
283             const string& idString,
284             CFastaIdValidate::EErrCode errCode,
285             const string& msg);
286 private:
287     using TCodePair = pair<CObjReaderLineException::EProblem, CObjReaderParseException::EErrCode>;
288     ILineErrorListener* m_pMessageListener = nullptr;
289     bool m_IgnoreGeneralParsingError=false;
290 };
291 
292 
293 
CIdErrorReporter(ILineErrorListener * pMessageListener,bool ignoreGeneralParsingError)294 CIdErrorReporter::CIdErrorReporter(ILineErrorListener* pMessageListener, bool ignoreGeneralParsingError) :
295     m_pMessageListener(pMessageListener), m_IgnoreGeneralParsingError(ignoreGeneralParsingError) {}
296 
297 
operator ()(EDiagSev severity,int lineNum,const string & idString,CFastaIdValidate::EErrCode errCode,const string & msg)298 void CIdErrorReporter::operator()(EDiagSev severity,
299         int lineNum,
300         const string& idString,
301         CFastaIdValidate::EErrCode errCode,
302         const string& msg)
303 {
304 
305     static map<CFastaIdValidate::EErrCode,TCodePair> s_CodeMap = /* replace with compile-time map */
306     {
307      {CFastaIdValidate::eIDTooLong,{ILineError::eProblem_GeneralParsingError, CObjReaderParseException::eIDTooLong}},
308      {CFastaIdValidate::eBadLocalID,{ILineError::eProblem_GeneralParsingError, CObjReaderParseException::eInvalidID}},
309      {CFastaIdValidate::eUnexpectedNucResidues,{ILineError::eProblem_UnexpectedNucResidues, CObjReaderParseException::eFormat}},
310      {CFastaIdValidate::eUnexpectedAminoAcids,{ILineError::eProblem_UnexpectedAminoAcids, CObjReaderParseException::eFormat}}
311     };
312 
313 
314     const auto cit = s_CodeMap.find(errCode);
315     _ASSERT(cit != s_CodeMap.end()); // convert this to a compile-time assertion
316 
317     const auto& problem = cit->second.first;
318     if (m_IgnoreGeneralParsingError &&
319         problem == ILineError::eProblem_GeneralParsingError) {
320         return;
321     }
322 
323     const auto& parseExceptionCode = cit->second.second;
324     if (severity == eDiag_Error) {
325         s_PostError(m_pMessageListener, lineNum, idString, msg, problem, parseExceptionCode);
326     }
327     else {
328         s_PostWarning(m_pMessageListener, lineNum, idString, msg, problem, parseExceptionCode);
329     }
330 }
331 
332 
x_ProcessIDs(const CTempString & id_string,const SDeflineParseInfo & info,TIds & ids,ILineErrorListener * pMessageListener,FIdCheck f_id_check)333 void CFastaDeflineReader::x_ProcessIDs(
334     const CTempString& id_string,
335     const SDeflineParseInfo& info,
336     TIds& ids,
337     ILineErrorListener* pMessageListener,
338     FIdCheck f_id_check
339     )
340 {
341     if (info.fBaseFlags & CReaderBase::fAllIdsAsLocal)
342     {
343         CRef<CSeq_id> pSeqId(new CSeq_id(CSeq_id::e_Local, id_string));
344         ids.push_back(pSeqId);
345         f_id_check(ids, info, pMessageListener);
346         return;
347     }
348 
349     CSeq_id::TParseFlags flags =
350         CSeq_id::fParse_PartialOK |
351         CSeq_id::fParse_AnyLocal;
352 
353     if (info.fFastaFlags & CFastaReader::fParseRawID) {
354         flags |= CSeq_id::fParse_RawText;
355     }
356 
357     string local_copy;
358     auto to_parse = id_string;
359     if (id_string.find(',') != NPOS &&
360         id_string.find('|') == NPOS) {
361         const string err_message =
362             "Near line " + NStr::NumericToString(info.lineNumber)
363             + ", the sequence id string contains 'comma' symbol, which has been replaced with 'underscore' "
364             + "symbol. Please correct the sequence id string.";
365 
366         s_PostWarning(pMessageListener,
367             info.lineNumber,
368             id_string,
369             err_message,
370             ILineError::eProblem_GeneralParsingError,
371             CObjReaderParseException::eFormat);
372 
373         local_copy = id_string;
374         for (auto& rit : local_copy)
375             if (rit == ',')
376                 rit = '_';
377 
378         to_parse = local_copy;
379     }
380 
381     try {
382         CSeq_id::ParseIDs(ids, to_parse, flags);
383         ids.remove_if([](CRef<CSeq_id> id_ref)
384                 { return NStr::IsBlank(id_ref->GetSeqIdString()); });
385     }
386     catch(...) {
387         ids.clear();
388     }
389 
390     if (ids.empty()) {
391         s_PostError(pMessageListener,
392                 info.lineNumber,
393                 id_string,
394                 "Could not construct seq-id from '" + id_string + "'",
395                 ILineError::eProblem_GeneralParsingError,
396                 CObjReaderParseException::eNoIDs);
397 
398         ids.push_back(Ref(new CSeq_id(CSeq_id::e_Local, id_string)));
399         return;
400     }
401     // Convert anything that looks like a GI to a local id
402     if ( info.fBaseFlags & CReaderBase::fNumericIdsAsLocal ) {
403         x_ConvertNumericToLocal(ids);
404     }
405 
406     f_id_check(ids, info, pMessageListener);
407 }
408 
409 
ParseIDs(const CTempString & s,const SDeflineParseInfo & info,const TIgnoredProblems & ignoredErrors,TIds & ids,ILineErrorListener * pMessageListener)410 bool CFastaDeflineReader::ParseIDs(
411     const CTempString& s,
412     const SDeflineParseInfo& info,
413     const TIgnoredProblems& ignoredErrors,
414     TIds& ids,
415     ILineErrorListener* pMessageListener)
416 {
417     if (s.empty()) {
418         return false;
419     }
420 
421     // if user wants all ids to be purely local, no problem
422     if( info.fBaseFlags & CReaderBase::fAllIdsAsLocal )
423     {
424         ids.push_back(Ref(new CSeq_id(CSeq_id::e_Local, s)));
425         return true;
426     }
427 
428     // be generous overall, and give raw local IDs the benefit of the
429     // doubt for now
430     CSeq_id::TParseFlags flags
431         = CSeq_id::fParse_PartialOK | CSeq_id::fParse_AnyLocal;
432     if ( info.fFastaFlags & CFastaReader::fParseRawID ) {
433         flags |= CSeq_id::fParse_RawText;
434     }
435 
436     const bool ignoreGeneralParsingError
437         = (find(ignoredErrors.cbegin(), ignoredErrors.cend(), ILineError::eProblem_GeneralParsingError)
438            != ignoredErrors.cend());
439 
440     try {
441         if (s.find(',') != NPOS && s.find('|') == NPOS)
442         {
443             string local_copy = s;
444             for (auto& ch : local_copy)
445                 if (ch == ',')
446                     ch = '_';
447 
448             CSeq_id::ParseIDs(ids, local_copy, flags);
449 
450             const string errMessage =
451                 "Near line " + NStr::NumericToString(info.lineNumber)
452                 + ", the sequence contains 'comma' symbol and replaced with 'underscore' "
453                 + "symbol. Please find and correct the sequence id.";
454 
455             if (!ignoreGeneralParsingError) {
456                 s_PostWarning(pMessageListener,
457                             info.lineNumber,
458                             s,
459                             errMessage,
460                             ILineError::eProblem_GeneralParsingError,
461                             CObjReaderParseException::eFormat);
462 
463             }
464         }
465         else
466         {
467             CSeq_id::ParseIDs(ids, s, flags);
468         }
469     } catch (CSeqIdException&) {
470         // swap(ids, old_ids);
471     }
472 
473     if ( info.fBaseFlags & CReaderBase::fNumericIdsAsLocal ) {
474         x_ConvertNumericToLocal(ids);
475     }
476 
477 
478     CFastaIdValidate idValidate(info.fFastaFlags);
479     if (info.maxIdLength) {
480         idValidate.SetMaxLocalIDLength(info.maxIdLength);
481         idValidate.SetMaxGeneralTagLength(info.maxIdLength);
482         idValidate.SetMaxAccessionLength(info.maxIdLength);
483     }
484     idValidate(ids, info.lineNumber, CIdErrorReporter(pMessageListener, ignoreGeneralParsingError));
485 
486     return true;
487 }
488 
489 
x_ConvertNumericToLocal(list<CRef<CSeq_id>> & ids)490 void CFastaDeflineReader::x_ConvertNumericToLocal(
491     list<CRef<CSeq_id>>& ids)
492 {
493     for (auto id : ids) {
494         if (id->IsGi()) {
495             const TGi gi = id->GetGi();
496             id->SetLocal().SetStr() = NStr::NumericToString(gi);
497         }
498     }
499 }
500 
501 
operator ()(const TIds & ids,const TInfo & info,ILineErrorListener * listener)502 void CSeqIdCheck::operator()(const TIds& ids,
503                              const TInfo& info,
504                              ILineErrorListener* listener)
505 {
506     if (ids.empty()) {
507         return;
508     }
509 
510     CFastaIdValidate s_IdValidate(info.fFastaFlags);
511     if (info.maxIdLength) {
512         s_IdValidate.SetMaxLocalIDLength(info.maxIdLength);
513         s_IdValidate.SetMaxGeneralTagLength(info.maxIdLength);
514         s_IdValidate.SetMaxAccessionLength(info.maxIdLength);
515     }
516     s_IdValidate(ids, info.lineNumber, CIdErrorReporter(listener));
517 }
518 
519 
GenerateID(bool unique_id)520 CRef<CSeq_id> CFastaIdHandler::GenerateID(bool unique_id)
521 {
522     return GenerateID("", unique_id);
523 }
524 
525 
GenerateID(const string & defline,const bool unique_id)526 CRef<CSeq_id> CFastaIdHandler::GenerateID(const string& defline, const bool unique_id)
527 {
528     const bool advance = true;
529     while (unique_id) {
530         auto p_Id = mp_IdGenerator->GenerateID(defline, advance);
531         auto idh = CSeq_id_Handle::GetHandle(*p_Id);
532         if (x_IsUniqueIdHandle(idh)) {
533             return p_Id;
534         }
535     }
536     // !unique_id
537     return mp_IdGenerator->GenerateID(defline, advance);
538 }
539 
540 
GenerateID(const bool advance)541 CRef<CSeq_id> CSeqIdGenerator::GenerateID(const bool advance)
542 {
543     return GenerateID("", advance);
544 }
545 
546 
GenerateID(const string & defline,const bool advance)547 CRef<CSeq_id> CSeqIdGenerator::GenerateID(const string& defline, const bool advance)
548 {
549     CRef<CSeq_id> seq_id(new CSeq_id);
550     auto n = m_Counter.load();
551     if (advance)
552         m_Counter++;
553 
554     if (m_Prefix.empty()  &&  m_Suffix.empty()) {
555         seq_id->SetLocal().SetId(n);
556     } else {
557         string& id = seq_id->SetLocal().SetStr();
558         id.reserve(128);
559         id += m_Prefix;
560         id += NStr::IntToString(n);
561         id += m_Suffix;
562     }
563     return seq_id;
564 }
565 
566 
GenerateID(void) const567 CRef<CSeq_id> CSeqIdGenerator::GenerateID(void) const
568 {
569     return const_cast<CSeqIdGenerator*>(this)->GenerateID(false);
570 }
571 
572 
573 END_SCOPE(objects)
574 END_NCBI_SCOPE
575 
576 
577