1 /*  $Id: gene_info.hpp 631547 2021-05-19 13:51:35Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Vahram Avagyan
27  *
28  */
29 
30 /// @file gene_info.hpp
31 /// Gene information class and related interfaces.
32 ///
33 /// Gene information is optionally presented in Blast output, along
34 /// with standard sequence deflines, in the form of one or more lines
35 /// describing the Gene, name of species, number of PubMed links, etc.
36 /// This file defines the Gene information class, the related exception
37 /// class, and the interface for obtaining Gene information from
38 /// an input source.
39 
40 #ifndef OBJTOOLS_BLAST_GENE_INFO_READER___GENE_INFO__HPP
41 #define OBJTOOLS_BLAST_GENE_INFO_READER___GENE_INFO__HPP
42 
43 //==========================================================================//
44 
45 #include <corelib/ncbiexpt.hpp>
46 #include <corelib/ncbiobj.hpp>
47 #include <corelib/ncbistre.hpp>
48 
49 BEGIN_NCBI_SCOPE
50 
51 
52 //==========================================================================//
53 
54 /// CGeneInfoException
55 ///
56 /// Class describing an exception thrown by the Gene information classes.
57 ///
58 /// CGeneInfoException can be thrown while trying to read, process, or
59 /// output Gene information in any class declared in this header file,
60 /// classes derived from those, or other related classes.
61 
62 class NCBI_XOBJREAD_EXPORT CGeneInfoException : public CException
63 {
64 public:
65     /// Error types for Gene Information processing.
66     enum EErrCode {
67         eInputError,            //< Invalid user input
68         eNetworkError,          //< Cannot access data via network
69         eDataFormatError,       //< File format not recognized
70         eFileNotFoundError,     //< File not found
71         eMemoryError,           //< Not enough memory
72         eInternalError          //< Internal/algorithmic error
73     };
74 
75     /// Translate from the error code value to its string representation.
GetErrCodeString(void) const76     virtual const char* GetErrCodeString(void) const override
77     {
78         switch (GetErrCode())
79         {
80             case eInputError:        return "eInputError";
81             case eNetworkError:      return "eNetworkError";
82             case eDataFormatError:   return "eDataFormatError";
83             case eFileNotFoundError: return "eFileNotFoundError";
84             case eMemoryError:       return "eMemoryError";
85             case eInternalError:     return "eInternalError";
86         }
87         return CException::GetErrCodeString();
88     }
89 
90     /// Standard exception boilerplate code.
91     NCBI_EXCEPTION_DEFAULT(CGeneInfoException, CException);
92 };
93 
94 
95 //==========================================================================//
96 
97 /// CGeneInfo
98 ///
99 /// Gene information storage and formatted output.
100 ///
101 /// CGeneInfo is used to store and format Gene information. It contains
102 /// several basic fields from the Entrez Gene database, such as Gene
103 /// symbol, description, unique ID, etc. The class is derived from CObject
104 /// so that one can freely use CRefs with this class.
105 
106 class NCBI_XOBJREAD_EXPORT CGeneInfo : public CObject
107 {
108 private:
109     /// Is the object properly initialized.
110     bool m_bIsInitialized;
111 
112     /// Numeric unique Gene ID.
113     int m_nGeneId;
114 
115     /// Official symbol of the Gene entry.
116     string m_strSymbol;
117 
118     /// Description of the Gene.
119     string m_strDescription;
120 
121     /// Scientific name of the organism (e.g. Sus scrofa).
122     string m_strOrgname;
123 
124     /// Number of PubMed links for this entry.
125     int m_nPubMedLinks;
126 
127 private:
128     /// Appends strSrc to strDest.
129     ///
130     /// The function makes sure that no single line
131     /// exceeds the maximum effective line length
132     /// (which is the actual number of characters
133     /// seen by the user, excluding HTML tags).
134     ///
135     /// @param strDest
136     ///     Destination string to write to.
137     /// @param nCurLineEffLength
138     ///     Length of the current line, the function
139     ///     updates this variable as necessary.
140     /// @param strSrc
141     ///     Source string to copy the characters from.
142     /// @param nSrcEffLength
143     ///     Effective length of the source string,
144     ///     excluding the HTML formatting tags, etc.
145     /// @param nMaxLineLength
146     ///     Maximum allowed effective length for a line.
147     static void x_Append(string& strDest,
148                          unsigned int& nCurLineEffLength,
149                          const string& strSrc,
150                          unsigned int nSrcEffLength,
151                          unsigned int nMaxLineLength);
152 
153 public:
154     /// Default constructor.
155     ///
156     /// This version of the constructor makes a default,
157     /// uninitialized Gene information object.
158     CGeneInfo();
159 
160     /// Constructor for initializing Gene information.
161     ///
162     /// This version of the constructor makes a fully initialized
163     /// Gene information object.
164     ///
165     /// @param nGeneId
166     ///     Unique integer ID of the Gene entry.
167     /// @param strSymbol
168     ///     Official symbol of the Gene entry.
169     /// @param strDescription
170     ///     Description (full name) of the Gene entry.
171     /// @param strOrgName
172     ///     Scientific name of the organism.
173     /// @param nPubMedLinks
174     ///     Number (or estimate) of related PubMed links.
175     CGeneInfo(int nGeneId,
176               const string& strSymbol,
177               const string& strDescription,
178               const string& strOrgName,
179               int nPubMedLinks);
180 
181     /// Destructor.
182     virtual ~CGeneInfo();
183 
184     /// Check if the object has been properly initialized.
IsInitialized() const185     bool IsInitialized() const {return m_bIsInitialized;}
186 
187     /// Get the numeric unique Gene ID.
GetGeneId() const188     int GetGeneId() const {return m_nGeneId;}
189 
190     /// Get the official symbol of the Gene entry.
GetSymbol() const191     const string& GetSymbol() const {return m_strSymbol;}
192 
193     /// Get the description of the Gene entry.
GetDescription() const194     const string& GetDescription() const {return m_strDescription;}
195 
196     /// Get the scientific name of the organism.
GetOrganismName() const197     const string& GetOrganismName() const {return m_strOrgname;}
198 
199     /// Get the number of PubMed links for this entry.
GetNumPubMedLinks() const200     int GetNumPubMedLinks() const {return m_nPubMedLinks;}
201 
202     /// Format the Gene information as a multiline string.
203     ///
204     /// This function combines all the Gene information in one string,
205     /// forming one or more lines not exceeding nMaxLineLength,
206     /// and adds several HTML elements, if requested.
207     ///
208     /// @param strGeneInfo
209     ///     Destination string to fill with the Gene information.
210     /// @param bFormatAsHTML
211     ///     This flag enables HTML formatting of the string,
212     ///     which includes links to the actual Entrez Gene entry,
213     ///     span tags for CSS processing, and so on.
214     /// @param nMaxLineLength
215     ///     Maximum allowed effective length for a line (this excludes
216     ///     HTML elements invisible to the user). If set to 0,
217     ///     the function will use a reasonable default value.
218     void ToString(string& strGeneInfo,
219                   bool bFormatAsHTML = false,
220                   const string& strGeneLinkURL = "",
221                   unsigned int nMaxLineLength = 0) const;
222 };
223 
224 /// Output the Gene information formatted as HTML.
225 NCBI_XOBJREAD_EXPORT
226 CNcbiOstream& operator<<(CNcbiOstream& out, const CGeneInfo& geneInfo);
227 
228 
229 //==========================================================================//
230 
231 /// IGeneInfoInput
232 ///
233 /// Gene information retrieval interface.
234 ///
235 /// IGeneInfoInput defines the interface for obtaining Gene information
236 /// objects for a given Gi or a given Gene ID from any input source.
237 /// Additionally, the interface defines Gi to/from Gene ID conversions.
238 
239 class NCBI_XOBJREAD_EXPORT IGeneInfoInput
240 {
241 public:
242     /// List of Gis.
243     typedef list<TGi>   TGiList;
244 
245     /// List of Gene IDs.
246     typedef list<int>   TGeneIdList;
247 
248     /// Gene ID to Gene Information map.
249     typedef map< int, CRef<CGeneInfo> > TGeneIdToGeneInfoMap;
250 
251     /// List of Gene Information objects.
252     typedef vector< CRef<CGeneInfo> > TGeneInfoList;
253 
254 public:
255     /// Destructor.
~IGeneInfoInput()256     virtual ~IGeneInfoInput() {}
257 
258     /// Get all Gene IDs for a given Gi.
259     ///
260     /// Function takes a Gi and appends all available Gene IDs
261     /// for that Gi to the Gene ID list. Notice that some Gis
262     /// may be deliberately left out of the lookup process.
263     ///
264     /// @param gi
265     ///     The Gi to look up.
266     /// @param geneIdList
267     ///     The Gene ID list to append to.
268     /// @return
269     ///     True if one or more Gene IDs were found for the Gi.
270     virtual bool
271         GetGeneIdsForGi(TGi gi, TGeneIdList& geneIdList) = 0;
272 
273     /// Get all RNA Gis for a given Gene ID.
274     ///
275     /// Function takes a Gene ID and appends all available RNA Gis
276     /// for that Gene ID to the Gi list.
277     ///
278     /// @param geneId
279     ///     The Gene ID to look up.
280     /// @param giList
281     ///     The Gi list to append to.
282     /// @return
283     ///     True if one or more Gis were found for the Gene ID.
284     virtual bool
285         GetRNAGisForGeneId(int geneId, TGiList& giList) = 0;
286 
287     /// Get all Protein Gis for a given Gene ID.
288     ///
289     /// Function takes a Gene ID and appends all available Protein Gis
290     /// for that Gene ID to the Gi list.
291     ///
292     /// @param geneId
293     ///     The Gene ID to look up.
294     /// @param giList
295     ///     The Gi list to append to.
296     /// @return
297     ///     True if one or more Gis were found for the Gene ID.
298     virtual bool
299         GetProteinGisForGeneId(int geneId, TGiList& giList) = 0;
300 
301     /// Get all Genomic Gis for a given Gene ID.
302     ///
303     /// Function takes a Gene ID and appends all available Genomic Gis
304     /// for that Gene ID to the Gi list.
305     ///
306     /// @param geneId
307     ///     The Gene ID to look up.
308     /// @param giList
309     ///     The Gi list to append to.
310     /// @return
311     ///     True if one or more Gis were found for the Gene ID.
312     virtual bool
313         GetGenomicGisForGeneId(int geneId, TGiList& giList) = 0;
314 
315     /// Get all Gene Information objects for a given Gi.
316     ///
317     /// Function takes a Gi, looks it up and appends all available
318     /// Gene information objects to the given list. Notice that some Gis
319     /// may be deliberately left out of the lookup process.
320     ///
321     /// @param gi
322     ///     The Gi to look up.
323     /// @param infoList
324     ///     The Gene information list to append to.
325     /// @return
326     ///     True if any Gene information was found for the Gi.
327     virtual bool
328         GetGeneInfoForGi(TGi gi, TGeneInfoList& infoList) = 0;
329 
330     /// Get all Gene Information objects for a given Gene ID.
331     ///
332     /// Function takes a Gene ID, looks it up and appends all available
333     /// Gene information objects to the given list.
334     ///
335     /// @param geneId
336     ///     The Gene ID to look up.
337     /// @param infoList
338     ///     The Gene information list to append to.
339     /// @return
340     ///     True if any Gene information was found for the Gene ID.
341     virtual bool
342         GetGeneInfoForId(int geneId, TGeneInfoList& infoList) = 0;
343 };
344 
345 //==========================================================================//
346 
347 
348 END_NCBI_SCOPE
349 
350 #endif
351 
352