1 #ifndef OBJTOOLS_READERS___FASTA__HPP
2 #define OBJTOOLS_READERS___FASTA__HPP
3 
4 /*  $Id: fasta.hpp 638122 2021-09-23 19:02:14Z ivanov $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Authors:  Aaron Ucko, NCBI
30 *
31 */
32 
33 /// @file fasta.hpp
34 /// Interfaces for reading and scanning FASTA-format sequences.
35 
36 #include <corelib/ncbi_limits.hpp>
37 #include <corelib/ncbi_param.hpp>
38 #include <util/line_reader.hpp>
39 #include <util/static_map.hpp>
40 #include <objects/seq/Bioseq.hpp>
41 #include <objects/seq/seq_id_handle.hpp>
42 #include <objects/seq/Seq_gap.hpp>
43 #include <objects/seq/Linkage_evidence.hpp>
44 #include <objects/seqloc/Seq_loc.hpp>
45 #include <objtools/readers/reader_base.hpp>
46 #include <objtools/readers/line_error.hpp>
47 #include <objtools/readers/message_listener.hpp>
48 #include <objtools/readers/source_mod_parser.hpp>
49 #include <objects/seq/Seq_data.hpp>
50 #include <objtools/readers/fasta_reader_utils.hpp>
51 #include <objtools/readers/mod_reader.hpp>
52 
53 #include <stack>
54 #include <sstream>
55 
56 /** @addtogroup Miscellaneous
57  *
58  * @{
59  */
60 
61 BEGIN_NCBI_SCOPE
62 BEGIN_SCOPE(objects)
63 
64 class CSeq_entry;
65 class CSeq_loc;
66 
67 //class CSeqIdGenerator;
68 
69 class ILineErrorListener;
70 class CSourceModParser;
71 
72 /// Base class for reading FASTA sequences.
73 ///
74 /// This class should suffice as for typical usage, but its
75 /// implementation has been broken up into virtual functions, allowing
76 /// for a wide range of specialized subclasses.
77 ///
78 /// @sa CFastaOstream
79 class NCBI_XOBJREAD_EXPORT CFastaReader : public CReaderBase {
80 public:
81     /// Note on fAllSeqIds: some databases (notably nr) have merged
82     /// identical sequences, joining their deflines together with
83     /// control-As.  Normally, the reader stops at the first
84     /// control-A; however, this flag makes it parse all the IDs.
85     enum EFlags {
86         fAssumeNuc            = 1<< 0, ///< Assume nucs unless accns indicate otherwise
87         fAssumeProt           = 1<< 1, ///< Assume prots unless accns indicate otherwise
88         fForceType            = 1<< 2, ///< Force specified type regardless of accession
89         fNoParseID            = 1<< 3, ///< Generate an ID (whole defline -&gt; title)
90         fParseGaps            = 1<< 4, ///< Make a delta sequence if gaps found
91         fOneSeq               = 1<< 5, ///< Just read the first sequence found
92         fAllSeqIds NCBI_STD_DEPRECATED("This flag is no longer used in CFastaReader.")   = 1<< 6, ///< Read Seq-ids past the first ^A (see note)
93         fNoSeqData            = 1<< 7, ///< Parse the deflines but skip the data
94         fRequireID            = 1<< 8, ///< Reject deflines that lack IDs
95         fDLOptional           = 1<< 9, ///< Don't require a leading defline
96         fParseRawID           = 1<<10, ///< Try to identify raw accessions
97         fSkipCheck            = 1<<11, ///< Skip (rudimentary) body content check
98         fNoSplit              = 1<<12, ///< Don't split out ambiguous sequence regions
99         fValidate             = 1<<13, ///< Check (alphabetic) residue validity
100         fUniqueIDs            = 1<<14, ///< Forbid duplicate IDs
101         fStrictGuess          = 1<<15, ///< Assume no typos when guessing sequence type
102         fLaxGuess             = 1<<16, ///< Use legacy heuristic for guessing seq. type
103         fAddMods              = 1<<17, ///< Parse defline mods and add to SeqEntry
104         fLetterGaps           = 1<<18, ///< Parse runs of Ns when splitting data
105         fNoUserObjs           = 1<<19, ///< Don't save raw deflines in User-objects
106         fBadModThrow  NCBI_STD_DEPRECATED("This flag is redundant as CFastaReader no longer utilizes CSourceModParser.")    = 1<<20,
107         fUnknModThrow NCBI_STD_DEPRECATED("This flag is redundant as CFastaReader no longer utilizes CSourceModParser.")    = 1<<21,
108         fLeaveAsText          = 1<<22, ///< Don't reencode at all, just parse
109         fQuickIDCheck         = 1<<23, ///< Just check local IDs' first characters
110         fUseIupacaa           = 1<<24, ///< If Prot, use iupacaa instead of the default ncbieaa.
111         fHyphensIgnoreAndWarn = 1<<25, ///< When a hyphen is encountered in seq data, ignore it but warn.
112         fDisableNoResidues    = 1<<26, ///< If no residues found do not raise an error
113         fDisableParseRange    = 1<<27, ///< No ranges in seq-ids.  Ranges part of seq-id instead.
114         fIgnoreMods           = 1<<28  ///< Ignore mods entirely. Incompatible with fAddMods.
115     };
116     using TFlags = long; ///< binary OR of EFlags
117 
118     using SDefLineParseInfo = CFastaDeflineReader::SDeflineParseInfo;
119     using FIdCheck = CFastaDeflineReader::FIdCheck;
120     using FModFilter = function<bool(const CTempString& mod_name)>;
121 
122 
123     CFastaReader(ILineReader& reader, TFlags flags = 0, FIdCheck f_idcheck = CSeqIdCheck());
124     CFastaReader(CNcbiIstream& in,    TFlags flags = 0, FIdCheck f_idcheck = CSeqIdCheck());
125     CFastaReader(const string& path,  TFlags flags = 0, FIdCheck f_idcheck = CSeqIdCheck());
126     CFastaReader(CReaderBase::TReaderFlags fBaseFlags,
127                  TFlags flags = 0,
128                  FIdCheck f_idcheck = CSeqIdCheck()
129                  );
130     virtual ~CFastaReader(void);
131 
132     /// CReaderBase overrides
133     virtual CRef<CSerialObject> ReadObject   (ILineReader &lr, ILineErrorListener *pErrors);
134     virtual CRef<CSeq_entry>    ReadSeqEntry (ILineReader &lr, ILineErrorListener *pErrors);
135 
136     /// Indicates (negatively) whether there is any more input.
AtEOF(void) const137     bool AtEOF(void) const { return m_LineReader->AtEOF(); }
138 
139     /// Read a single effective sequence, which may turn out to be a
140     /// segmented set.
141     virtual CRef<CSeq_entry> ReadOneSeq(ILineErrorListener * pMessageListener = 0);
142 
143     /// Read multiple sequences (by default, as many as are available.)
144     CRef<CSeq_entry> ReadSet(int max_seqs = kMax_Int, ILineErrorListener * pMessageListener = 0);
145 
146     /// Read as many sequences as are available, and interpret them as
147     /// an alignment, with hyphens marking relative deletions.
148     /// @param reference_row
149     ///   0-based; the special value -1 yields a full (pseudo-?)N-way alignment.
150     CRef<CSeq_entry> ReadAlignedSet(int reference_row, ILineErrorListener * pMessageListener = 0);
151 
152     // also allow changing?
GetFlags(void) const153     TFlags GetFlags(void) const { return m_Flags.top(); }
154 
155     typedef CRef<CSeq_loc> TMask;
156     typedef vector<TMask>  TMasks;
157 
158     /// Directs the *following* call to ReadOneSeq to note the locations
159     /// of lowercase letters.
160     /// @return
161     ///     A smart pointer to the Seq-loc that will be populated.
162     CRef<CSeq_loc> SaveMask(void);
163 
SaveMasks(TMasks * masks)164     void SaveMasks(TMasks* masks) { m_MaskVec = masks; }
165 
GetIDs(void) const166     const CBioseq::TId& GetIDs(void) const    { return m_CurrentSeq->GetId(); }
GetBestID(void) const167     const CSeq_id&      GetBestID(void) const { return *m_BestID; }
168 
GetIDGenerator(void) const169     const CSeqIdGenerator& GetIDGenerator(void) const { return  m_IDHandler->GetGenerator(); }
SetIDGenerator(void)170     CSeqIdGenerator&       SetIDGenerator(void)       { return  m_IDHandler->SetGenerator(); }
171     void                   SetIDGenerator(CSeqIdGenerator& gen);
172 
173     /// Re-allow previously seen IDs even if fUniqueIds is on.
174     //void ResetIDTracker(void) { m_IDTracker.clear(); }
ResetIDTracker(void)175     void ResetIDTracker(void) { m_IDHandler->ClearIdCache(); }
176 
177     void SetExcludedMods(const vector<string>& excluded_mods);
178 
179     /// If this is set, an exception will be thrown if a Sequence ID exceeds the
180     /// given length. Overrides the id lengths specified in class CSeq_id.
181     /// @param max_len
182     ///   The new maximum to set.  Of course, you can set it to kMax_UI4
183     ///   to effectively have no limit.
184     void SetMaxIDLength(Uint4 max_len);
185 
186     /// Get the maximum ID allowed, which will be kMax_UI4
187     /// unless someone has manually lowered it.
188     ///
189     /// @returns currently set maximum ID length
GetMaxIDLength(void) const190     Uint4 GetMaxIDLength(void) const { return m_MaxIDLength; }
191 
192     // Make case-sensitive and other kinds of insensitivity, too
193     // (such as "spaces" and "underscores" becoming "hyphens"
194     static string CanonicalizeString(const CTempString & sValue);
195 
196     void SetMinGaps(TSeqPos gapNmin, TSeqPos gap_Unknown_length);
197 
198     void SetGapLinkageEvidence(
199             CSeq_gap::EType type,
200             const set<int>& defaultEvidence,
201             const map<TSeqPos, set<int>>& countToEvidenceMap);
202 
203     void SetGapLinkageEvidences(
204         CSeq_gap::EType type,
205         const set<int>& evidences);
206 
207     void IgnoreProblem(ILineError::EProblem problem);
208 
209     using SLineTextAndLoc = CFastaDeflineReader::SLineTextAndLoc;
210 
211     using TIgnoredProblems = vector<ILineError::EProblem>;
212     using TSeqTitles = vector<SLineTextAndLoc>;
213     typedef CTempString TStr;
214 
215     static void ParseDefLine(const TStr& defLine,
216         const SDefLineParseInfo& info,
217         const TIgnoredProblems& ignoredErrors,
218         list<CRef<CSeq_id>>& ids,
219         bool& hasRange,
220         TSeqPos& rangeStart,
221         TSeqPos& rangeEnd,
222         TSeqTitles& seqTitles,
223         ILineErrorListener* pMessageListener);
224 
225 protected:
226     enum EInternalFlags {
227         fAligning = 0x40000000,
228         fInSegSet = 0x20000000
229     };
230 
231 
232     virtual void ParseDefLine (const TStr& s, ILineErrorListener * pMessageListener);
233 
234     virtual void PostProcessIDs(const CBioseq::TId& defline_ids,
235         const string& defline,
236         bool has_range=false,
237         TSeqPos range_start=kInvalidSeqPos,
238         TSeqPos range_end=kInvalidSeqPos);
239 
240     static size_t ParseRange    (const TStr& s, TSeqPos& start, TSeqPos& end, ILineErrorListener * pMessageListener);
241     virtual void   ParseTitle    (const SLineTextAndLoc & lineInfo, ILineErrorListener * pMessageListener);
242     virtual bool   IsValidLocalID(const TStr& s) const;
243     static bool IsValidLocalID(const TStr& idString, TFlags fFastaFlags);
244     virtual void   GenerateID    (void);
245     virtual void   ParseDataLine (const TStr& s, ILineErrorListener * pMessageListener);
246     virtual void   CheckDataLine (const TStr& s, ILineErrorListener * pMessageListener);
247     virtual void   x_CloseGap    (TSeqPos len, bool atStartOfLine, ILineErrorListener * pMessageListener);
248     virtual void   x_OpenMask    (void);
249     virtual void   x_CloseMask   (void);
250     virtual bool   ParseGapLine  (const TStr& s, ILineErrorListener * pMessageListener);
251     virtual void   AssembleSeq   (ILineErrorListener * pMessageListener);
252     virtual void   AssignMolType (ILineErrorListener * pMessageListener);
253     virtual bool   CreateWarningsForSeqDataInTitle(
254         const TStr& sLineText,
255         TSeqPos iLineNum,
256         ILineErrorListener * pMessageListener) const;
257     virtual void   PostWarning(ILineErrorListener * pMessageListener,
258             EDiagSev _eSeverity, size_t _uLineNum, CTempString _MessageStrmOps,
259             CObjReaderParseException::EErrCode _eErrCode,
260             ILineError::EProblem _eProblem,
261             CTempString _sFeature, CTempString _sQualName, CTempString _sQualValue) const;
262 
263     typedef int                         TRowNum;
264     typedef map<TRowNum, TSignedSeqPos> TSubMap;
265     // align coord -> row -> seq coord
266     typedef map<TSeqPos, TSubMap>       TStartsMap;
267     typedef vector<CRef<CSeq_id> >      TIds;
268 
269     CRef<CSeq_entry> x_ReadSeqsToAlign(TIds& ids, ILineErrorListener * pMessageListener);
270     void             x_AddPairwiseAlignments(CSeq_annot& annot, const TIds& ids,
271                                              TRowNum reference_row);
272     void             x_AddMultiwayAlignment(CSeq_annot& annot, const TIds& ids);
273 
274     // inline utilities
CloseGap(bool atStartOfLine=true,ILineErrorListener * pMessageListener=0)275     void CloseGap(bool atStartOfLine=true, ILineErrorListener * pMessageListener = 0) {
276         if (m_CurrentGapLength > 0) {
277             x_CloseGap(m_CurrentGapLength, atStartOfLine, pMessageListener);
278             m_CurrentGapLength = 0;
279         }
280     }
281     void OpenMask(void);
CloseMask(void)282     void CloseMask(void)
283         { if (m_MaskRangeStart != kInvalidSeqPos) { x_CloseMask(); } }
StreamPosition(void) const284     Int8 StreamPosition(void) const
285         { return NcbiStreamposToInt8(m_LineReader->GetPosition()); }
LineNumber(void) const286     Uint8 LineNumber(void) const
287         { return m_LineReader->GetLineNumber(); }
288 
GetLineReader(void)289     ILineReader&  GetLineReader(void)         { return *m_LineReader; }
TestFlag(EFlags flag) const290     bool          TestFlag(EFlags flag) const
291         { return (GetFlags() & flag) != 0; }
TestFlag(EInternalFlags flag) const292     bool          TestFlag(EInternalFlags flag) const
293         { return (GetFlags() & flag) != 0; }
SetIDs(void)294     CBioseq::TId& SetIDs(void)                { return m_CurrentSeq->SetId(); }
295 
296     // NB: Not necessarily fully-formed!
SetCurrentSeq(void)297     CBioseq& SetCurrentSeq(void)              { return *m_CurrentSeq; }
298 
299     enum EPosType {
300         eRawPos,
301         ePosWithGaps,
302         ePosWithGapsAndSegs
303     };
304     TSeqPos GetCurrentPos(EPosType pos_type);
305 
306     std::string x_NucOrProt(void) const;
307 
308 private:
309     CModHandler m_ModHandler;
310 
311     void x_ApplyMods(const string& title,
312                      TSeqPos line_number,
313                      CBioseq& bioseq,
314                      ILineErrorListener* pMessageListener);
315 
316     void x_SetDeflineParseInfo(SDefLineParseInfo& info);
317 
318     bool m_bModifiedMaxIdLength=false;
319 
320 protected:
321     struct SGap : public CObject {
322         enum EKnownSize {
323             eKnownSize_No,
324             eKnownSize_Yes
325         };
326 
327         // This lets us represent a "not-set" state for CSeq_gap::EType
328         // as an unset pointer.  This is needed because CSeq_gap::EType doesn't
329         // have a "not_set" enum value at this time.
330         typedef CObjectFor<CSeq_gap::EType> TGapTypeObj;
331         typedef CConstRef<TGapTypeObj> TNullableGapType;
332 
333         SGap(
334             TSeqPos pos,
335             TSignedSeqPos len, // signed so we can catch negative numbers and throw an exception
336             EKnownSize eKnownSize,
337             Uint8 uLineNumber,
338             TNullableGapType pGapType = TNullableGapType(),
339             const set<CLinkage_evidence::EType>& setOfLinkageEvidence =
340                 set<CLinkage_evidence::EType>());
341         // immutable once created
342 
343         // 0-based, and NOT counting previous gaps
344         const TSeqPos m_uPos;
345         // 0: unknown, negative: negated nominal length of unknown gap size.
346         // positive: known gap size
347         const TSeqPos m_uLen;
348         const EKnownSize m_eKnownSize;
349         const Uint8 m_uLineNumber;
350         // NULL means "no gap type specified"
351         const TNullableGapType m_pGapType;
352         typedef set<CLinkage_evidence::EType> TLinkEvidSet;
353         const TLinkEvidSet m_setOfLinkageEvidence;
354     private:
355         // forbid assignment and copy-construction
356         SGap & operator = (const SGap & );
357         SGap(const SGap & rhs);
358     };
359 
360     CRef<CSeq_align> xCreateAlignment(CRef<CSeq_id> old_id,
361         CRef<CSeq_id> new_id,
362         TSeqPos range_start,
363         TSeqPos range_end);
364 
365     bool xSetSeqMol(const list<CRef<CSeq_id>>& ids, CSeq_inst_Base::EMol& mol);
366 
367     typedef CRef<SGap> TGapRef;
368     typedef vector<TGapRef>     TGaps;
369     typedef set<CSeq_id_Handle> TIDTracker;
370 
371     CRef<ILineReader>       m_LineReader;
372     stack<TFlags>           m_Flags;
373     CRef<CBioseq>           m_CurrentSeq;
374     TMask                   m_CurrentMask;
375     TMask                   m_NextMask;
376     TMasks *                m_MaskVec;
377     CRef<CFastaIdHandler>   m_IDHandler;
378     string                  m_SeqData;
379     TGaps                   m_Gaps;
380     TSeqPos                 m_CurrentPos; // does not count gaps
381     TSeqPos                 m_MaskRangeStart;
382     TSeqPos                 m_SegmentBase;
383     TSeqPos                 m_CurrentGapLength;
384     TSeqPos                 m_TotalGapLength;
385     TSeqPos                 m_gapNmin;
386     TSeqPos                 m_gap_Unknown_length;
387     char                    m_CurrentGapChar;
388     CRef<CSeq_id>           m_BestID;
389     TStartsMap              m_Starts;
390     TRowNum                 m_Row;
391     TSeqPos                 m_Offset;
392     TIDTracker              m_IDTracker;
393     CSourceModParser::TMods m_BadMods;
394     CSourceModParser::TMods m_UnusedMods;
395     Uint4                   m_MaxIDLength;
396 
397 
398 
399     using TCountToLinkEvidMap = map<TSeqPos, SGap::TLinkEvidSet>;
400     TCountToLinkEvidMap     m_GapsizeToLinkageEvidence;
401 private:
402     SGap::TLinkEvidSet      m_DefaultLinkageEvidence;
403 protected:
404     SGap::TNullableGapType  m_gap_type;
405 
406     TSeqTitles m_CurrentSeqTitles;
407     std::vector<ILineError::EProblem> m_ignorable;
408     FIdCheck m_fIdCheck;
409     FModFilter m_fModFilter = nullptr;
410 };
411 
412 
413 typedef CFastaReader::TFlags TReadFastaFlags;
414 
415 
416 /// A const-correct replacement for the deprecated ReadFasta function
417 /// @sa CFastaReader
418 NCBI_XOBJREAD_EXPORT
419 CRef<CSeq_entry> ReadFasta(CNcbiIstream& in, CFastaReader::TFlags flags=0,
420                            int* counter = nullptr,
421                            CFastaReader::TMasks* lcv=nullptr,
422                            ILineErrorListener* pMessageListener=nullptr);
423 
424 //////////////////////////////////////////////////////////////////
425 //
426 // Class - description of multi-entry FASTA file,
427 // to keep list of offsets on all molecules in the file.
428 //
429 struct SFastaFileMap
430 {
431     struct SFastaEntry
432     {
SFastaEntrySFastaFileMap::SFastaEntry433         SFastaEntry() : stream_offset(0) {}
434         /// List of qll sequence ids
435         typedef list<string>  TFastaSeqIds;
436 
437         string         seq_id;        ///< Primary sequence Id
438         string         description;   ///< Molecule description
439         CNcbiStreampos stream_offset; ///< Molecule offset in file
440         TFastaSeqIds   all_seq_ids;   ///< List of all seq.ids
441     };
442 
443     typedef vector<SFastaEntry>  TMapVector;
444 
445     TMapVector   file_map; // vector keeps list of all molecule entries
446 };
447 
448 /// Callback interface to scan fasta file for entries
449 class NCBI_XOBJREAD_EXPORT IFastaEntryScan
450 {
451 public:
452     virtual ~IFastaEntryScan();
453 
454     /// Callback function, called after reading the fasta entry
455     virtual void EntryFound(CRef<CSeq_entry> se,
456                             CNcbiStreampos stream_position) = 0;
457 };
458 
459 
460 /// Function reads input stream (assumed that it is FASTA format) one
461 /// molecule entry after another filling the map structure describing and
462 /// pointing on molecule entries. Fasta map can be used later for quick
463 /// CSeq_entry retrival
464 void NCBI_XOBJREAD_EXPORT ReadFastaFileMap(SFastaFileMap* fasta_map,
465                                            CNcbiIfstream& input);
466 
467 /// Scan FASTA files, call IFastaEntryScan::EntryFound (payload function)
468 void NCBI_XOBJREAD_EXPORT ScanFastaFile(IFastaEntryScan* scanner,
469                                         CNcbiIfstream&   input,
470                                         CFastaReader::TFlags fread_flags);
471 
472 
473 /////////////////// CFastaReader inline methods
474 
475 
476 inline
OpenMask()477 void CFastaReader::OpenMask()
478 {
479     if (m_MaskRangeStart == kInvalidSeqPos  &&  m_CurrentMask.NotEmpty()) {
480         x_OpenMask();
481     }
482 }
483 
484 inline
GetCurrentPos(EPosType pos_type)485 TSeqPos CFastaReader::GetCurrentPos(EPosType pos_type)
486 {
487     TSeqPos pos = m_CurrentPos;
488     switch (pos_type) {
489     case ePosWithGapsAndSegs:
490         pos += m_SegmentBase;
491         // FALL THROUGH!!
492     case ePosWithGaps:
493         pos += m_TotalGapLength;
494         // FALL THROUGH!!
495     case eRawPos:
496         return pos;
497     default:
498         return kInvalidSeqPos;
499     }
500 }
501 
502 END_SCOPE(objects)
503 END_NCBI_SCOPE
504 
505 /* @} */
506 
507 #endif  /* OBJTOOLS_READERS___FASTA__HPP */
508