1 /*  $Id: agp_seq_entry.hpp 632526 2021-06-02 17:25:01Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Mike DiCuccio, Michael Kornbluh
27  *
28  * File Description:
29  *     Convert an AGP file into a vector of Seq-entries
30  *
31  */
32 
33 #ifndef __OBJTOOLS_READERS_AGP_SEQ_ENTRY__HPP_
34 #define __OBJTOOLS_READERS_AGP_SEQ_ENTRY__HPP_
35 
36 #include <objtools/readers/agp_util.hpp>
37 
38 #include <objects/seq/Bioseq.hpp>
39 
40 BEGIN_NCBI_SCOPE
41 
// Forward declarations of the object-model classes referenced by the
// declarations in this header.
namespace objects {
    class CBioseq;
    class CSeq_entry;
    class CSeq_gap;
    class CSeq_id;
}
48 
49 /// This class is used to turn an AGP file into a vector of Seq-entry's
50 class NCBI_XOBJREAD_EXPORT CAgpToSeqEntry : public CAgpReader {
51 public:
52 
53     /// This is the way the results will be returned
54     /// Each Seq-entry contains just one Bioseq, built from the AGP file(s).
55     typedef vector< CRef<objects::CSeq_entry> > TSeqEntryRefVec;
56 
57     enum EFlags {
58         /// Found gaps will not be given Seq-data such as Type and Linkage
59         fSetSeqGap     = (1 << 0),
60         /// All IDs will be treated as local IDs.
61         /// The default if this is NOT set is to first try to parse the ID,
62         /// and only make local if parsing fails.
63         fForceLocalId  = (1 << 1)
64     };
65     typedef int TFlags;
66 
67     /// After construction, you probably want to do something like
68     /// call ReadStream and then GetResult.
69     ///
70     /// @param agp_version
71     ///   What is the AGP version of the input?  Default is to auto-detect AGP version,
72     ///   which is likely what the user wants to do most of the time.
73     CAgpToSeqEntry( TFlags fFlags = 0,
74         EAgpVersion agp_version = eAgpVersion_auto,
75         CAgpErr* arg = NULL );
76 
77     /// This gets the results found, but don't call before finalizing.  We are intentionally
78     /// giving a non-const reference because the caller is free to
79     /// take the seq-entries inside and do whatever they like with them.
80     /// Each Seq-entry contains just one Bioseq, built from the AGP file(s).
GetResult(void)81     TSeqEntryRefVec & GetResult(void) { return m_entries; }
82 
83     /// This is the default method used to turn strings into Seq-ids in AGP contexts.
84     ///
85     /// @sa x_GetSeqIdFromStr
86     static CRef<objects::CSeq_id> s_DefaultSeqIdFromStr( const std::string & str );
87 
88     /// Turn a string into a local Seq-id (removing "lcl|" from the beginning if needed)
89     static CRef<objects::CSeq_id> s_LocalSeqIdFromStr( const std::string & str );
90 
91 protected:
92 
93     const TFlags m_fFlags;
94 
95     /// Builds new part of delta-seq in current bioseq, or adds bioseq
96     /// and starts building a new one.
97     virtual void OnGapOrComponent(void);
98 
99     /// Parent finalize plus making sure last m_bioseq is added.
100     virtual int Finalize(void);
101 
102     /// Our own finalization after parent's finalization.
103     void x_FinishedBioseq(void);
104 
105     /// If you must change exactly how strings are turned into Seq-ids,
106     /// you can override this in a subclass.  The default
107     // is to use s_DefaultSeqIdFromStr.
108     virtual CRef<objects::CSeq_id> x_GetSeqIdFromStr( const std::string & str );
109 
110     /// Fills in out_gap_info based on current CAgpRow
111     void x_SetSeqGap( objects::CSeq_gap & out_gap_info );
112 
113     /// This is the bioseq currently being built
114     CRef<objects::CBioseq> m_bioseq;
115     /// Holds the results
116     vector< CRef<objects::CSeq_entry> > m_entries;
117 
118 private:
119 
120     // forbid copy and assignment
121     CAgpToSeqEntry( const CAgpToSeqEntry & );
122     CAgpToSeqEntry & operator = (const CAgpToSeqEntry & );
123 };
124 
125 END_NCBI_SCOPE
126 
127 #endif // end of "include-guard"
128