1 #ifndef OBJTOOLS_READERS___FASTA__HPP
2 #define OBJTOOLS_READERS___FASTA__HPP
3
4 /* $Id: fasta.hpp 638122 2021-09-23 19:02:14Z ivanov $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Authors: Aaron Ucko, NCBI
30 *
31 */
32
33 /// @file fasta.hpp
34 /// Interfaces for reading and scanning FASTA-format sequences.
35
36 #include <corelib/ncbi_limits.hpp>
37 #include <corelib/ncbi_param.hpp>
38 #include <util/line_reader.hpp>
39 #include <util/static_map.hpp>
40 #include <objects/seq/Bioseq.hpp>
41 #include <objects/seq/seq_id_handle.hpp>
42 #include <objects/seq/Seq_gap.hpp>
43 #include <objects/seq/Linkage_evidence.hpp>
44 #include <objects/seqloc/Seq_loc.hpp>
45 #include <objtools/readers/reader_base.hpp>
46 #include <objtools/readers/line_error.hpp>
47 #include <objtools/readers/message_listener.hpp>
48 #include <objtools/readers/source_mod_parser.hpp>
49 #include <objects/seq/Seq_data.hpp>
50 #include <objtools/readers/fasta_reader_utils.hpp>
51 #include <objtools/readers/mod_reader.hpp>
52
53 #include <stack>
54 #include <sstream>
55
56 /** @addtogroup Miscellaneous
57 *
58 * @{
59 */
60
61 BEGIN_NCBI_SCOPE
62 BEGIN_SCOPE(objects)
63
64 class CSeq_entry;
65 class CSeq_loc;
66
67 //class CSeqIdGenerator;
68
69 class ILineErrorListener;
70 class CSourceModParser;
71
72 /// Base class for reading FASTA sequences.
73 ///
74 /// This class should suffice as for typical usage, but its
75 /// implementation has been broken up into virtual functions, allowing
76 /// for a wide range of specialized subclasses.
77 ///
78 /// @sa CFastaOstream
79 class NCBI_XOBJREAD_EXPORT CFastaReader : public CReaderBase {
80 public:
81 /// Note on fAllSeqIds: some databases (notably nr) have merged
82 /// identical sequences, joining their deflines together with
83 /// control-As. Normally, the reader stops at the first
84 /// control-A; however, this flag makes it parse all the IDs.
85 enum EFlags {
86 fAssumeNuc = 1<< 0, ///< Assume nucs unless accns indicate otherwise
87 fAssumeProt = 1<< 1, ///< Assume prots unless accns indicate otherwise
88 fForceType = 1<< 2, ///< Force specified type regardless of accession
89 fNoParseID = 1<< 3, ///< Generate an ID (whole defline -> title)
90 fParseGaps = 1<< 4, ///< Make a delta sequence if gaps found
91 fOneSeq = 1<< 5, ///< Just read the first sequence found
92 fAllSeqIds NCBI_STD_DEPRECATED("This flag is no longer used in CFastaReader.") = 1<< 6, ///< Read Seq-ids past the first ^A (see note)
93 fNoSeqData = 1<< 7, ///< Parse the deflines but skip the data
94 fRequireID = 1<< 8, ///< Reject deflines that lack IDs
95 fDLOptional = 1<< 9, ///< Don't require a leading defline
96 fParseRawID = 1<<10, ///< Try to identify raw accessions
97 fSkipCheck = 1<<11, ///< Skip (rudimentary) body content check
98 fNoSplit = 1<<12, ///< Don't split out ambiguous sequence regions
99 fValidate = 1<<13, ///< Check (alphabetic) residue validity
100 fUniqueIDs = 1<<14, ///< Forbid duplicate IDs
101 fStrictGuess = 1<<15, ///< Assume no typos when guessing sequence type
102 fLaxGuess = 1<<16, ///< Use legacy heuristic for guessing seq. type
103 fAddMods = 1<<17, ///< Parse defline mods and add to SeqEntry
104 fLetterGaps = 1<<18, ///< Parse runs of Ns when splitting data
105 fNoUserObjs = 1<<19, ///< Don't save raw deflines in User-objects
106 fBadModThrow NCBI_STD_DEPRECATED("This flag is redundant as CFastaReader no longer utilizes CSourceModParser.") = 1<<20,
107 fUnknModThrow NCBI_STD_DEPRECATED("This flag is redundant as CFastaReader no longer utilizes CSourceModParser.") = 1<<21,
108 fLeaveAsText = 1<<22, ///< Don't reencode at all, just parse
109 fQuickIDCheck = 1<<23, ///< Just check local IDs' first characters
110 fUseIupacaa = 1<<24, ///< If Prot, use iupacaa instead of the default ncbieaa.
111 fHyphensIgnoreAndWarn = 1<<25, ///< When a hyphen is encountered in seq data, ignore it but warn.
112 fDisableNoResidues = 1<<26, ///< If no residues found do not raise an error
113 fDisableParseRange = 1<<27, ///< No ranges in seq-ids. Ranges part of seq-id instead.
114 fIgnoreMods = 1<<28 ///< Ignore mods entirely. Incompatible with fAddMods.
115 };
116 using TFlags = long; ///< binary OR of EFlags
117
118 using SDefLineParseInfo = CFastaDeflineReader::SDeflineParseInfo;
119 using FIdCheck = CFastaDeflineReader::FIdCheck;
120 using FModFilter = function<bool(const CTempString& mod_name)>;
121
122
123 CFastaReader(ILineReader& reader, TFlags flags = 0, FIdCheck f_idcheck = CSeqIdCheck());
124 CFastaReader(CNcbiIstream& in, TFlags flags = 0, FIdCheck f_idcheck = CSeqIdCheck());
125 CFastaReader(const string& path, TFlags flags = 0, FIdCheck f_idcheck = CSeqIdCheck());
126 CFastaReader(CReaderBase::TReaderFlags fBaseFlags,
127 TFlags flags = 0,
128 FIdCheck f_idcheck = CSeqIdCheck()
129 );
130 virtual ~CFastaReader(void);
131
132 /// CReaderBase overrides
133 virtual CRef<CSerialObject> ReadObject (ILineReader &lr, ILineErrorListener *pErrors);
134 virtual CRef<CSeq_entry> ReadSeqEntry (ILineReader &lr, ILineErrorListener *pErrors);
135
136 /// Indicates (negatively) whether there is any more input.
AtEOF(void) const137 bool AtEOF(void) const { return m_LineReader->AtEOF(); }
138
139 /// Read a single effective sequence, which may turn out to be a
140 /// segmented set.
141 virtual CRef<CSeq_entry> ReadOneSeq(ILineErrorListener * pMessageListener = 0);
142
143 /// Read multiple sequences (by default, as many as are available.)
144 CRef<CSeq_entry> ReadSet(int max_seqs = kMax_Int, ILineErrorListener * pMessageListener = 0);
145
146 /// Read as many sequences as are available, and interpret them as
147 /// an alignment, with hyphens marking relative deletions.
148 /// @param reference_row
149 /// 0-based; the special value -1 yields a full (pseudo-?)N-way alignment.
150 CRef<CSeq_entry> ReadAlignedSet(int reference_row, ILineErrorListener * pMessageListener = 0);
151
152 // also allow changing?
GetFlags(void) const153 TFlags GetFlags(void) const { return m_Flags.top(); }
154
155 typedef CRef<CSeq_loc> TMask;
156 typedef vector<TMask> TMasks;
157
158 /// Directs the *following* call to ReadOneSeq to note the locations
159 /// of lowercase letters.
160 /// @return
161 /// A smart pointer to the Seq-loc that will be populated.
162 CRef<CSeq_loc> SaveMask(void);
163
SaveMasks(TMasks * masks)164 void SaveMasks(TMasks* masks) { m_MaskVec = masks; }
165
GetIDs(void) const166 const CBioseq::TId& GetIDs(void) const { return m_CurrentSeq->GetId(); }
GetBestID(void) const167 const CSeq_id& GetBestID(void) const { return *m_BestID; }
168
GetIDGenerator(void) const169 const CSeqIdGenerator& GetIDGenerator(void) const { return m_IDHandler->GetGenerator(); }
SetIDGenerator(void)170 CSeqIdGenerator& SetIDGenerator(void) { return m_IDHandler->SetGenerator(); }
171 void SetIDGenerator(CSeqIdGenerator& gen);
172
173 /// Re-allow previously seen IDs even if fUniqueIds is on.
174 //void ResetIDTracker(void) { m_IDTracker.clear(); }
ResetIDTracker(void)175 void ResetIDTracker(void) { m_IDHandler->ClearIdCache(); }
176
177 void SetExcludedMods(const vector<string>& excluded_mods);
178
179 /// If this is set, an exception will be thrown if a Sequence ID exceeds the
180 /// given length. Overrides the id lengths specified in class CSeq_id.
181 /// @param max_len
182 /// The new maximum to set. Of course, you can set it to kMax_UI4
183 /// to effectively have no limit.
184 void SetMaxIDLength(Uint4 max_len);
185
186 /// Get the maximum ID allowed, which will be kMax_UI4
187 /// unless someone has manually lowered it.
188 ///
189 /// @returns currently set maximum ID length
GetMaxIDLength(void) const190 Uint4 GetMaxIDLength(void) const { return m_MaxIDLength; }
191
192 // Make case-sensitive and other kinds of insensitivity, too
193 // (such as "spaces" and "underscores" becoming "hyphens"
194 static string CanonicalizeString(const CTempString & sValue);
195
196 void SetMinGaps(TSeqPos gapNmin, TSeqPos gap_Unknown_length);
197
198 void SetGapLinkageEvidence(
199 CSeq_gap::EType type,
200 const set<int>& defaultEvidence,
201 const map<TSeqPos, set<int>>& countToEvidenceMap);
202
203 void SetGapLinkageEvidences(
204 CSeq_gap::EType type,
205 const set<int>& evidences);
206
207 void IgnoreProblem(ILineError::EProblem problem);
208
209 using SLineTextAndLoc = CFastaDeflineReader::SLineTextAndLoc;
210
211 using TIgnoredProblems = vector<ILineError::EProblem>;
212 using TSeqTitles = vector<SLineTextAndLoc>;
213 typedef CTempString TStr;
214
215 static void ParseDefLine(const TStr& defLine,
216 const SDefLineParseInfo& info,
217 const TIgnoredProblems& ignoredErrors,
218 list<CRef<CSeq_id>>& ids,
219 bool& hasRange,
220 TSeqPos& rangeStart,
221 TSeqPos& rangeEnd,
222 TSeqTitles& seqTitles,
223 ILineErrorListener* pMessageListener);
224
225 protected:
226 enum EInternalFlags {
227 fAligning = 0x40000000,
228 fInSegSet = 0x20000000
229 };
230
231
232 virtual void ParseDefLine (const TStr& s, ILineErrorListener * pMessageListener);
233
234 virtual void PostProcessIDs(const CBioseq::TId& defline_ids,
235 const string& defline,
236 bool has_range=false,
237 TSeqPos range_start=kInvalidSeqPos,
238 TSeqPos range_end=kInvalidSeqPos);
239
240 static size_t ParseRange (const TStr& s, TSeqPos& start, TSeqPos& end, ILineErrorListener * pMessageListener);
241 virtual void ParseTitle (const SLineTextAndLoc & lineInfo, ILineErrorListener * pMessageListener);
242 virtual bool IsValidLocalID(const TStr& s) const;
243 static bool IsValidLocalID(const TStr& idString, TFlags fFastaFlags);
244 virtual void GenerateID (void);
245 virtual void ParseDataLine (const TStr& s, ILineErrorListener * pMessageListener);
246 virtual void CheckDataLine (const TStr& s, ILineErrorListener * pMessageListener);
247 virtual void x_CloseGap (TSeqPos len, bool atStartOfLine, ILineErrorListener * pMessageListener);
248 virtual void x_OpenMask (void);
249 virtual void x_CloseMask (void);
250 virtual bool ParseGapLine (const TStr& s, ILineErrorListener * pMessageListener);
251 virtual void AssembleSeq (ILineErrorListener * pMessageListener);
252 virtual void AssignMolType (ILineErrorListener * pMessageListener);
253 virtual bool CreateWarningsForSeqDataInTitle(
254 const TStr& sLineText,
255 TSeqPos iLineNum,
256 ILineErrorListener * pMessageListener) const;
257 virtual void PostWarning(ILineErrorListener * pMessageListener,
258 EDiagSev _eSeverity, size_t _uLineNum, CTempString _MessageStrmOps,
259 CObjReaderParseException::EErrCode _eErrCode,
260 ILineError::EProblem _eProblem,
261 CTempString _sFeature, CTempString _sQualName, CTempString _sQualValue) const;
262
263 typedef int TRowNum;
264 typedef map<TRowNum, TSignedSeqPos> TSubMap;
265 // align coord -> row -> seq coord
266 typedef map<TSeqPos, TSubMap> TStartsMap;
267 typedef vector<CRef<CSeq_id> > TIds;
268
269 CRef<CSeq_entry> x_ReadSeqsToAlign(TIds& ids, ILineErrorListener * pMessageListener);
270 void x_AddPairwiseAlignments(CSeq_annot& annot, const TIds& ids,
271 TRowNum reference_row);
272 void x_AddMultiwayAlignment(CSeq_annot& annot, const TIds& ids);
273
274 // inline utilities
CloseGap(bool atStartOfLine=true,ILineErrorListener * pMessageListener=0)275 void CloseGap(bool atStartOfLine=true, ILineErrorListener * pMessageListener = 0) {
276 if (m_CurrentGapLength > 0) {
277 x_CloseGap(m_CurrentGapLength, atStartOfLine, pMessageListener);
278 m_CurrentGapLength = 0;
279 }
280 }
281 void OpenMask(void);
CloseMask(void)282 void CloseMask(void)
283 { if (m_MaskRangeStart != kInvalidSeqPos) { x_CloseMask(); } }
StreamPosition(void) const284 Int8 StreamPosition(void) const
285 { return NcbiStreamposToInt8(m_LineReader->GetPosition()); }
LineNumber(void) const286 Uint8 LineNumber(void) const
287 { return m_LineReader->GetLineNumber(); }
288
GetLineReader(void)289 ILineReader& GetLineReader(void) { return *m_LineReader; }
TestFlag(EFlags flag) const290 bool TestFlag(EFlags flag) const
291 { return (GetFlags() & flag) != 0; }
TestFlag(EInternalFlags flag) const292 bool TestFlag(EInternalFlags flag) const
293 { return (GetFlags() & flag) != 0; }
SetIDs(void)294 CBioseq::TId& SetIDs(void) { return m_CurrentSeq->SetId(); }
295
296 // NB: Not necessarily fully-formed!
SetCurrentSeq(void)297 CBioseq& SetCurrentSeq(void) { return *m_CurrentSeq; }
298
299 enum EPosType {
300 eRawPos,
301 ePosWithGaps,
302 ePosWithGapsAndSegs
303 };
304 TSeqPos GetCurrentPos(EPosType pos_type);
305
306 std::string x_NucOrProt(void) const;
307
308 private:
309 CModHandler m_ModHandler;
310
311 void x_ApplyMods(const string& title,
312 TSeqPos line_number,
313 CBioseq& bioseq,
314 ILineErrorListener* pMessageListener);
315
316 void x_SetDeflineParseInfo(SDefLineParseInfo& info);
317
318 bool m_bModifiedMaxIdLength=false;
319
320 protected:
321 struct SGap : public CObject {
322 enum EKnownSize {
323 eKnownSize_No,
324 eKnownSize_Yes
325 };
326
327 // This lets us represent a "not-set" state for CSeq_gap::EType
328 // as an unset pointer. This is needed because CSeq_gap::EType doesn't
329 // have a "not_set" enum value at this time.
330 typedef CObjectFor<CSeq_gap::EType> TGapTypeObj;
331 typedef CConstRef<TGapTypeObj> TNullableGapType;
332
333 SGap(
334 TSeqPos pos,
335 TSignedSeqPos len, // signed so we can catch negative numbers and throw an exception
336 EKnownSize eKnownSize,
337 Uint8 uLineNumber,
338 TNullableGapType pGapType = TNullableGapType(),
339 const set<CLinkage_evidence::EType>& setOfLinkageEvidence =
340 set<CLinkage_evidence::EType>());
341 // immutable once created
342
343 // 0-based, and NOT counting previous gaps
344 const TSeqPos m_uPos;
345 // 0: unknown, negative: negated nominal length of unknown gap size.
346 // positive: known gap size
347 const TSeqPos m_uLen;
348 const EKnownSize m_eKnownSize;
349 const Uint8 m_uLineNumber;
350 // NULL means "no gap type specified"
351 const TNullableGapType m_pGapType;
352 typedef set<CLinkage_evidence::EType> TLinkEvidSet;
353 const TLinkEvidSet m_setOfLinkageEvidence;
354 private:
355 // forbid assignment and copy-construction
356 SGap & operator = (const SGap & );
357 SGap(const SGap & rhs);
358 };
359
360 CRef<CSeq_align> xCreateAlignment(CRef<CSeq_id> old_id,
361 CRef<CSeq_id> new_id,
362 TSeqPos range_start,
363 TSeqPos range_end);
364
365 bool xSetSeqMol(const list<CRef<CSeq_id>>& ids, CSeq_inst_Base::EMol& mol);
366
367 typedef CRef<SGap> TGapRef;
368 typedef vector<TGapRef> TGaps;
369 typedef set<CSeq_id_Handle> TIDTracker;
370
371 CRef<ILineReader> m_LineReader;
372 stack<TFlags> m_Flags;
373 CRef<CBioseq> m_CurrentSeq;
374 TMask m_CurrentMask;
375 TMask m_NextMask;
376 TMasks * m_MaskVec;
377 CRef<CFastaIdHandler> m_IDHandler;
378 string m_SeqData;
379 TGaps m_Gaps;
380 TSeqPos m_CurrentPos; // does not count gaps
381 TSeqPos m_MaskRangeStart;
382 TSeqPos m_SegmentBase;
383 TSeqPos m_CurrentGapLength;
384 TSeqPos m_TotalGapLength;
385 TSeqPos m_gapNmin;
386 TSeqPos m_gap_Unknown_length;
387 char m_CurrentGapChar;
388 CRef<CSeq_id> m_BestID;
389 TStartsMap m_Starts;
390 TRowNum m_Row;
391 TSeqPos m_Offset;
392 TIDTracker m_IDTracker;
393 CSourceModParser::TMods m_BadMods;
394 CSourceModParser::TMods m_UnusedMods;
395 Uint4 m_MaxIDLength;
396
397
398
399 using TCountToLinkEvidMap = map<TSeqPos, SGap::TLinkEvidSet>;
400 TCountToLinkEvidMap m_GapsizeToLinkageEvidence;
401 private:
402 SGap::TLinkEvidSet m_DefaultLinkageEvidence;
403 protected:
404 SGap::TNullableGapType m_gap_type;
405
406 TSeqTitles m_CurrentSeqTitles;
407 std::vector<ILineError::EProblem> m_ignorable;
408 FIdCheck m_fIdCheck;
409 FModFilter m_fModFilter = nullptr;
410 };
411
412
413 typedef CFastaReader::TFlags TReadFastaFlags;
414
415
416 /// A const-correct replacement for the deprecated ReadFasta function
417 /// @sa CFastaReader
418 NCBI_XOBJREAD_EXPORT
419 CRef<CSeq_entry> ReadFasta(CNcbiIstream& in, CFastaReader::TFlags flags=0,
420 int* counter = nullptr,
421 CFastaReader::TMasks* lcv=nullptr,
422 ILineErrorListener* pMessageListener=nullptr);
423
424 //////////////////////////////////////////////////////////////////
425 //
426 // Class - description of multi-entry FASTA file,
427 // to keep list of offsets on all molecules in the file.
428 //
429 struct SFastaFileMap
430 {
431 struct SFastaEntry
432 {
SFastaEntrySFastaFileMap::SFastaEntry433 SFastaEntry() : stream_offset(0) {}
434 /// List of qll sequence ids
435 typedef list<string> TFastaSeqIds;
436
437 string seq_id; ///< Primary sequence Id
438 string description; ///< Molecule description
439 CNcbiStreampos stream_offset; ///< Molecule offset in file
440 TFastaSeqIds all_seq_ids; ///< List of all seq.ids
441 };
442
443 typedef vector<SFastaEntry> TMapVector;
444
445 TMapVector file_map; // vector keeps list of all molecule entries
446 };
447
448 /// Callback interface to scan fasta file for entries
449 class NCBI_XOBJREAD_EXPORT IFastaEntryScan
450 {
451 public:
452 virtual ~IFastaEntryScan();
453
454 /// Callback function, called after reading the fasta entry
455 virtual void EntryFound(CRef<CSeq_entry> se,
456 CNcbiStreampos stream_position) = 0;
457 };
458
459
460 /// Function reads input stream (assumed that it is FASTA format) one
461 /// molecule entry after another filling the map structure describing and
462 /// pointing on molecule entries. Fasta map can be used later for quick
463 /// CSeq_entry retrival
464 void NCBI_XOBJREAD_EXPORT ReadFastaFileMap(SFastaFileMap* fasta_map,
465 CNcbiIfstream& input);
466
467 /// Scan FASTA files, call IFastaEntryScan::EntryFound (payload function)
468 void NCBI_XOBJREAD_EXPORT ScanFastaFile(IFastaEntryScan* scanner,
469 CNcbiIfstream& input,
470 CFastaReader::TFlags fread_flags);
471
472
473 /////////////////// CFastaReader inline methods
474
475
476 inline
OpenMask()477 void CFastaReader::OpenMask()
478 {
479 if (m_MaskRangeStart == kInvalidSeqPos && m_CurrentMask.NotEmpty()) {
480 x_OpenMask();
481 }
482 }
483
484 inline
GetCurrentPos(EPosType pos_type)485 TSeqPos CFastaReader::GetCurrentPos(EPosType pos_type)
486 {
487 TSeqPos pos = m_CurrentPos;
488 switch (pos_type) {
489 case ePosWithGapsAndSegs:
490 pos += m_SegmentBase;
491 // FALL THROUGH!!
492 case ePosWithGaps:
493 pos += m_TotalGapLength;
494 // FALL THROUGH!!
495 case eRawPos:
496 return pos;
497 default:
498 return kInvalidSeqPos;
499 }
500 }
501
502 END_SCOPE(objects)
503 END_NCBI_SCOPE
504
505 /* @} */
506
507 #endif /* OBJTOOLS_READERS___FASTA__HPP */
508