1 /* $Id: fasta_reader_utils.cpp 634226 2021-07-06 20:04:10Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Justin Foley
27 *
28 * File Description:
29 * Reader for FASTA-format definition lines. Based on code
30 * originally contained in CFastaReader.
31 *
32 * ===========================================================================
33 */
34
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbidiag.hpp>
37 #include <objtools/error_codes.hpp>
38 #include <objtools/readers/fasta.hpp>
39 #include <objects/general/Dbtag.hpp>
40 #include <objects/general/Object_id.hpp>
41 #include <objtools/readers/fasta_reader_utils.hpp>
42 #include <objtools/readers/seqid_validate.hpp>
43
44 #define NCBI_USE_ERRCODE_X Objtools_Rd_Fasta // Will need to change this
45
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(objects)
48
// Definitions of the static length limits of CFastaDeflineReader (local IDs,
// general tags, accessions). Each one is initialized from the CSeq_id constant
// of the same name; none of them is declared const here.
49 size_t CFastaDeflineReader::s_MaxLocalIDLength = CSeq_id::kMaxLocalIDLength;
50 size_t CFastaDeflineReader::s_MaxGeneralTagLength = CSeq_id::kMaxGeneralTagLength;
51 size_t CFastaDeflineReader::s_MaxAccessionLength = CSeq_id::kMaxAccessionLength;
52
53
54
s_PostError(ILineErrorListener * pMessageListener,const TSeqPos lineNumber,const string & idString,const string & errMessage,const CObjReaderLineException::EProblem problem,const CObjReaderParseException::EErrCode errCode)55 static void s_PostError(ILineErrorListener* pMessageListener,
56 const TSeqPos lineNumber,
57 const string& idString,
58 const string& errMessage,
59 const CObjReaderLineException::EProblem problem,
60 const CObjReaderParseException::EErrCode errCode)
61 {
62 if (pMessageListener) {
63 unique_ptr<CObjReaderLineException> pLineExpt(
64 CObjReaderLineException::Create(
65 eDiag_Error,
66 lineNumber,
67 errMessage,
68 problem,
69 idString, "", "", "",
70 errCode));
71
72 if (pMessageListener->PutError(*pLineExpt)) {
73 return;
74 }
75 }
76
77 throw CObjReaderParseException(DIAG_COMPILE_INFO,
78 0,
79 errCode,
80 errMessage,
81 lineNumber,
82 eDiag_Error);
83 }
84
s_PostWarning(ILineErrorListener * pMessageListener,const TSeqPos lineNumber,const string & idString,const string & errMessage,const CObjReaderLineException::EProblem problem,const CObjReaderParseException::EErrCode errCode)85 static void s_PostWarning(ILineErrorListener* pMessageListener,
86 const TSeqPos lineNumber,
87 const string& idString,
88 const string& errMessage,
89 const CObjReaderLineException::EProblem problem,
90 const CObjReaderParseException::EErrCode errCode)
91 {
92 unique_ptr<CObjReaderLineException> pLineExpt(
93 CObjReaderLineException::Create(
94 eDiag_Warning,
95 lineNumber,
96 errMessage,
97 problem,
98 idString, "", "", "",
99 errCode));
100
101 if (!pMessageListener) {
102 LOG_POST_X(1, Warning << pLineExpt->Message());
103 return;
104 }
105
106 if (!pMessageListener->PutError(*pLineExpt)) {
107 throw CObjReaderParseException(DIAG_COMPILE_INFO,
108 0,
109 errCode,
110 errMessage,
111 lineNumber,
112 eDiag_Warning);
113 }
114 }
115
116 // For reasons of efficiency, this method does not use CRef<CSeq_interval> to access range
117 // information - RW-26
ParseDefline(const CTempString & defline,const SDeflineParseInfo & info,const TIgnoredProblems & ignored_errors,TIds & ids,bool & has_range,TSeqPos & range_start,TSeqPos & range_end,TSeqTitles & titles,ILineErrorListener * pMessageListener)118 void CFastaDeflineReader::ParseDefline(const CTempString& defline,
119 const SDeflineParseInfo& info,
120 const TIgnoredProblems& ignored_errors,
121 TIds& ids,
122 bool& has_range,
123 TSeqPos& range_start,
124 TSeqPos& range_end,
125 TSeqTitles& titles,
126 ILineErrorListener* pMessageListener)
127 {
128 SDeflineData data;
129 ParseDefline(defline, info, data, pMessageListener);
130 has_range = data.has_range;
131 range_start = data.range_start;
132 range_end = data.range_end;
133 titles = move(data.titles);
134 }
135
ParseDefline(const CTempString & defline,const SDeflineParseInfo & info,SDeflineData & data,ILineErrorListener * pMessageListener)136 void CFastaDeflineReader::ParseDefline(const CTempString& defline,
137 const SDeflineParseInfo& info,
138 SDeflineData& data,
139 ILineErrorListener* pMessageListener)
140 {
141 static CSeqIdCheck fn_idcheck;
142 ParseDefline(defline, info, data, pMessageListener, fn_idcheck);
143 }
144
145
ParseDefline(const CTempString & defline,const SDeflineParseInfo & info,SDeflineData & data,ILineErrorListener * pMessageListener,FIdCheck fn_idcheck)146 void CFastaDeflineReader::ParseDefline(const CTempString& defline,
147 const SDeflineParseInfo& info,
148 SDeflineData& data,
149 ILineErrorListener* pMessageListener,
150 FIdCheck fn_idcheck)
151 {
152 size_t range_len = 0;
153 const TFastaFlags& fFastaFlags = info.fFastaFlags;
154 const TSeqPos& lineNumber = info.lineNumber;
155 data.has_range = false;
156
157 const size_t len = defline.length();
158 if (len <= 1 ||
159 NStr::IsBlank(defline.substr(1))) {
160 return;
161 }
162
163 if (defline[0] != '>') {
164 NCBI_THROW2(CObjReaderParseException, eFormat,
165 "Invalid defline. First character is not '>'", 0);
166 }
167
168 // ignore spaces between '>' and the sequence ID
169 size_t start;
170 for(start = 1 ; start < len; ++start ) {
171 if( ! isspace(defline[start]) ) {
172 break;
173 }
174 }
175
176 size_t pos;
177 size_t title_start = NPOS;
178 if ((fFastaFlags & CFastaReader::fNoParseID)) {
179 title_start = start;
180 }
181 else
182 {
183 pos = start;
184 while (pos < len && defline[pos] > ' ') {
185 pos++;
186 }
187
188 if ( ! (fFastaFlags & CFastaReader::fDisableParseRange) ) {
189 range_len = ParseRange(defline.substr(start, pos - start),
190 data.range_start, data.range_end, pMessageListener);
191 }
192
193 auto id_string = defline.substr(start, pos - start - range_len);
194 if (NStr::IsBlank(id_string)) {
195 NCBI_THROW2(CObjReaderParseException, eFormat,
196 "Unable to locate sequence id in definition line", 0);
197 }
198
199 title_start = pos;
200 x_ProcessIDs(id_string,
201 info,
202 data.ids,
203 pMessageListener,
204 fn_idcheck);
205
206 data.has_range = (range_len>0);
207 }
208
209 // trim leading whitespace from title (is this appropriate?)
210 while (title_start < len
211 && isspace((unsigned char)defline[title_start])) {
212 ++title_start;
213 }
214
215 if (title_start < len) {
216 for (pos = title_start + 1; pos < len; ++pos) {
217 if ((unsigned char)defline[pos] < ' ') {
218 break;
219 }
220 }
221 // Parse the title elsewhere - after the molecule has been deduced
222 data.titles.push_back(
223 SLineTextAndLoc(
224 defline.substr(title_start, pos - title_start), lineNumber));
225 }
226 }
227
228
ParseRange(const CTempString & s,TSeqPos & start,TSeqPos & end,ILineErrorListener * pMessageListener)229 TSeqPos CFastaDeflineReader::ParseRange(
230 const CTempString& s,
231 TSeqPos& start,
232 TSeqPos& end,
233 ILineErrorListener * pMessageListener)
234 {
235
236 if (s.empty()) {
237 return 0;
238 }
239
240 bool on_start = false;
241 bool negative = false;
242 TSeqPos mult = 1;
243 size_t pos;
244 start = end = 0;
245 for (pos = s.length() - 1; pos > 0; --pos) {
246 unsigned char c = s[pos];
247 if (c >= '0' && c <= '9') {
248 if (on_start) {
249 start += (c - '0') * mult;
250 } else {
251 end += (c - '0') * mult;
252 }
253 mult *= 10;
254 } else if (c == '-' && !on_start && mult > 1) {
255 on_start = true;
256 mult = 1;
257 } else if (c == ':' && on_start && mult > 1) {
258 break;
259 } else if (c == 'c' && pos > 0 && s[--pos] == ':'
260 && on_start && mult > 1) {
261 negative = true;
262 break;
263 } else {
264 return 0; // syntax error
265 }
266 }
267 if ((negative ? (end > start) : (start > end)) || s[pos] != ':') {
268 return 0;
269 }
270 --start;
271 --end;
272 return TSeqPos(s.length() - pos);
273 }
274
275
276 class CIdErrorReporter
277 {
278 public:
279 CIdErrorReporter(ILineErrorListener* pMessageListener, bool ignoreGeneralParsingError=false);
280
281 void operator() (EDiagSev severity,
282 int lineNum,
283 const string& idString,
284 CFastaIdValidate::EErrCode errCode,
285 const string& msg);
286 private:
287 using TCodePair = pair<CObjReaderLineException::EProblem, CObjReaderParseException::EErrCode>;
288 ILineErrorListener* m_pMessageListener = nullptr;
289 bool m_IgnoreGeneralParsingError=false;
290 };
291
292
293
CIdErrorReporter(ILineErrorListener * pMessageListener,bool ignoreGeneralParsingError)294 CIdErrorReporter::CIdErrorReporter(ILineErrorListener* pMessageListener, bool ignoreGeneralParsingError) :
295 m_pMessageListener(pMessageListener), m_IgnoreGeneralParsingError(ignoreGeneralParsingError) {}
296
297
operator ()(EDiagSev severity,int lineNum,const string & idString,CFastaIdValidate::EErrCode errCode,const string & msg)298 void CIdErrorReporter::operator()(EDiagSev severity,
299 int lineNum,
300 const string& idString,
301 CFastaIdValidate::EErrCode errCode,
302 const string& msg)
303 {
304
305 static map<CFastaIdValidate::EErrCode,TCodePair> s_CodeMap = /* replace with compile-time map */
306 {
307 {CFastaIdValidate::eIDTooLong,{ILineError::eProblem_GeneralParsingError, CObjReaderParseException::eIDTooLong}},
308 {CFastaIdValidate::eBadLocalID,{ILineError::eProblem_GeneralParsingError, CObjReaderParseException::eInvalidID}},
309 {CFastaIdValidate::eUnexpectedNucResidues,{ILineError::eProblem_UnexpectedNucResidues, CObjReaderParseException::eFormat}},
310 {CFastaIdValidate::eUnexpectedAminoAcids,{ILineError::eProblem_UnexpectedAminoAcids, CObjReaderParseException::eFormat}}
311 };
312
313
314 const auto cit = s_CodeMap.find(errCode);
315 _ASSERT(cit != s_CodeMap.end()); // convert this to a compile-time assertion
316
317 const auto& problem = cit->second.first;
318 if (m_IgnoreGeneralParsingError &&
319 problem == ILineError::eProblem_GeneralParsingError) {
320 return;
321 }
322
323 const auto& parseExceptionCode = cit->second.second;
324 if (severity == eDiag_Error) {
325 s_PostError(m_pMessageListener, lineNum, idString, msg, problem, parseExceptionCode);
326 }
327 else {
328 s_PostWarning(m_pMessageListener, lineNum, idString, msg, problem, parseExceptionCode);
329 }
330 }
331
332
x_ProcessIDs(const CTempString & id_string,const SDeflineParseInfo & info,TIds & ids,ILineErrorListener * pMessageListener,FIdCheck f_id_check)333 void CFastaDeflineReader::x_ProcessIDs(
334 const CTempString& id_string,
335 const SDeflineParseInfo& info,
336 TIds& ids,
337 ILineErrorListener* pMessageListener,
338 FIdCheck f_id_check
339 )
340 {
341 if (info.fBaseFlags & CReaderBase::fAllIdsAsLocal)
342 {
343 CRef<CSeq_id> pSeqId(new CSeq_id(CSeq_id::e_Local, id_string));
344 ids.push_back(pSeqId);
345 f_id_check(ids, info, pMessageListener);
346 return;
347 }
348
349 CSeq_id::TParseFlags flags =
350 CSeq_id::fParse_PartialOK |
351 CSeq_id::fParse_AnyLocal;
352
353 if (info.fFastaFlags & CFastaReader::fParseRawID) {
354 flags |= CSeq_id::fParse_RawText;
355 }
356
357 string local_copy;
358 auto to_parse = id_string;
359 if (id_string.find(',') != NPOS &&
360 id_string.find('|') == NPOS) {
361 const string err_message =
362 "Near line " + NStr::NumericToString(info.lineNumber)
363 + ", the sequence id string contains 'comma' symbol, which has been replaced with 'underscore' "
364 + "symbol. Please correct the sequence id string.";
365
366 s_PostWarning(pMessageListener,
367 info.lineNumber,
368 id_string,
369 err_message,
370 ILineError::eProblem_GeneralParsingError,
371 CObjReaderParseException::eFormat);
372
373 local_copy = id_string;
374 for (auto& rit : local_copy)
375 if (rit == ',')
376 rit = '_';
377
378 to_parse = local_copy;
379 }
380
381 try {
382 CSeq_id::ParseIDs(ids, to_parse, flags);
383 ids.remove_if([](CRef<CSeq_id> id_ref)
384 { return NStr::IsBlank(id_ref->GetSeqIdString()); });
385 }
386 catch(...) {
387 ids.clear();
388 }
389
390 if (ids.empty()) {
391 s_PostError(pMessageListener,
392 info.lineNumber,
393 id_string,
394 "Could not construct seq-id from '" + id_string + "'",
395 ILineError::eProblem_GeneralParsingError,
396 CObjReaderParseException::eNoIDs);
397
398 ids.push_back(Ref(new CSeq_id(CSeq_id::e_Local, id_string)));
399 return;
400 }
401 // Convert anything that looks like a GI to a local id
402 if ( info.fBaseFlags & CReaderBase::fNumericIdsAsLocal ) {
403 x_ConvertNumericToLocal(ids);
404 }
405
406 f_id_check(ids, info, pMessageListener);
407 }
408
409
ParseIDs(const CTempString & s,const SDeflineParseInfo & info,const TIgnoredProblems & ignoredErrors,TIds & ids,ILineErrorListener * pMessageListener)410 bool CFastaDeflineReader::ParseIDs(
411 const CTempString& s,
412 const SDeflineParseInfo& info,
413 const TIgnoredProblems& ignoredErrors,
414 TIds& ids,
415 ILineErrorListener* pMessageListener)
416 {
417 if (s.empty()) {
418 return false;
419 }
420
421 // if user wants all ids to be purely local, no problem
422 if( info.fBaseFlags & CReaderBase::fAllIdsAsLocal )
423 {
424 ids.push_back(Ref(new CSeq_id(CSeq_id::e_Local, s)));
425 return true;
426 }
427
428 // be generous overall, and give raw local IDs the benefit of the
429 // doubt for now
430 CSeq_id::TParseFlags flags
431 = CSeq_id::fParse_PartialOK | CSeq_id::fParse_AnyLocal;
432 if ( info.fFastaFlags & CFastaReader::fParseRawID ) {
433 flags |= CSeq_id::fParse_RawText;
434 }
435
436 const bool ignoreGeneralParsingError
437 = (find(ignoredErrors.cbegin(), ignoredErrors.cend(), ILineError::eProblem_GeneralParsingError)
438 != ignoredErrors.cend());
439
440 try {
441 if (s.find(',') != NPOS && s.find('|') == NPOS)
442 {
443 string local_copy = s;
444 for (auto& ch : local_copy)
445 if (ch == ',')
446 ch = '_';
447
448 CSeq_id::ParseIDs(ids, local_copy, flags);
449
450 const string errMessage =
451 "Near line " + NStr::NumericToString(info.lineNumber)
452 + ", the sequence contains 'comma' symbol and replaced with 'underscore' "
453 + "symbol. Please find and correct the sequence id.";
454
455 if (!ignoreGeneralParsingError) {
456 s_PostWarning(pMessageListener,
457 info.lineNumber,
458 s,
459 errMessage,
460 ILineError::eProblem_GeneralParsingError,
461 CObjReaderParseException::eFormat);
462
463 }
464 }
465 else
466 {
467 CSeq_id::ParseIDs(ids, s, flags);
468 }
469 } catch (CSeqIdException&) {
470 // swap(ids, old_ids);
471 }
472
473 if ( info.fBaseFlags & CReaderBase::fNumericIdsAsLocal ) {
474 x_ConvertNumericToLocal(ids);
475 }
476
477
478 CFastaIdValidate idValidate(info.fFastaFlags);
479 if (info.maxIdLength) {
480 idValidate.SetMaxLocalIDLength(info.maxIdLength);
481 idValidate.SetMaxGeneralTagLength(info.maxIdLength);
482 idValidate.SetMaxAccessionLength(info.maxIdLength);
483 }
484 idValidate(ids, info.lineNumber, CIdErrorReporter(pMessageListener, ignoreGeneralParsingError));
485
486 return true;
487 }
488
489
x_ConvertNumericToLocal(list<CRef<CSeq_id>> & ids)490 void CFastaDeflineReader::x_ConvertNumericToLocal(
491 list<CRef<CSeq_id>>& ids)
492 {
493 for (auto id : ids) {
494 if (id->IsGi()) {
495 const TGi gi = id->GetGi();
496 id->SetLocal().SetStr() = NStr::NumericToString(gi);
497 }
498 }
499 }
500
501
operator ()(const TIds & ids,const TInfo & info,ILineErrorListener * listener)502 void CSeqIdCheck::operator()(const TIds& ids,
503 const TInfo& info,
504 ILineErrorListener* listener)
505 {
506 if (ids.empty()) {
507 return;
508 }
509
510 CFastaIdValidate s_IdValidate(info.fFastaFlags);
511 if (info.maxIdLength) {
512 s_IdValidate.SetMaxLocalIDLength(info.maxIdLength);
513 s_IdValidate.SetMaxGeneralTagLength(info.maxIdLength);
514 s_IdValidate.SetMaxAccessionLength(info.maxIdLength);
515 }
516 s_IdValidate(ids, info.lineNumber, CIdErrorReporter(listener));
517 }
518
519
GenerateID(bool unique_id)520 CRef<CSeq_id> CFastaIdHandler::GenerateID(bool unique_id)
521 {
522 return GenerateID("", unique_id);
523 }
524
525
GenerateID(const string & defline,const bool unique_id)526 CRef<CSeq_id> CFastaIdHandler::GenerateID(const string& defline, const bool unique_id)
527 {
528 const bool advance = true;
529 while (unique_id) {
530 auto p_Id = mp_IdGenerator->GenerateID(defline, advance);
531 auto idh = CSeq_id_Handle::GetHandle(*p_Id);
532 if (x_IsUniqueIdHandle(idh)) {
533 return p_Id;
534 }
535 }
536 // !unique_id
537 return mp_IdGenerator->GenerateID(defline, advance);
538 }
539
540
GenerateID(const bool advance)541 CRef<CSeq_id> CSeqIdGenerator::GenerateID(const bool advance)
542 {
543 return GenerateID("", advance);
544 }
545
546
GenerateID(const string & defline,const bool advance)547 CRef<CSeq_id> CSeqIdGenerator::GenerateID(const string& defline, const bool advance)
548 {
549 CRef<CSeq_id> seq_id(new CSeq_id);
550 auto n = m_Counter.load();
551 if (advance)
552 m_Counter++;
553
554 if (m_Prefix.empty() && m_Suffix.empty()) {
555 seq_id->SetLocal().SetId(n);
556 } else {
557 string& id = seq_id->SetLocal().SetStr();
558 id.reserve(128);
559 id += m_Prefix;
560 id += NStr::IntToString(n);
561 id += m_Suffix;
562 }
563 return seq_id;
564 }
565
566
GenerateID(void) const567 CRef<CSeq_id> CSeqIdGenerator::GenerateID(void) const
568 {
569 return const_cast<CSeqIdGenerator*>(this)->GenerateID(false);
570 }
571
572
573 END_SCOPE(objects)
574 END_NCBI_SCOPE
575
576
577