/*  $Id: gene_info.hpp 631547 2021-05-19 13:51:35Z ivanov $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Vahram Avagyan
 *
 */

/// @file gene_info.hpp
/// Gene information class and related interfaces.
///
/// Gene information is optionally presented in Blast output, along
/// with standard sequence deflines, in the form of one or more lines
/// describing the Gene, name of species, number of PubMed links, etc.
/// This file defines the Gene information class, the related exception
/// class, and the interface for obtaining Gene information from
/// an input source.
39 40 #ifndef OBJTOOLS_BLAST_GENE_INFO_READER___GENE_INFO__HPP 41 #define OBJTOOLS_BLAST_GENE_INFO_READER___GENE_INFO__HPP 42 43 //==========================================================================// 44 45 #include <corelib/ncbiexpt.hpp> 46 #include <corelib/ncbiobj.hpp> 47 #include <corelib/ncbistre.hpp> 48 49 BEGIN_NCBI_SCOPE 50 51 52 //==========================================================================// 53 54 /// CGeneInfoException 55 /// 56 /// Class describing an exception thrown by the Gene information classes. 57 /// 58 /// CGeneInfoException can be thrown while trying to read, process, or 59 /// output Gene information in any class declared in this header file, 60 /// classes derived from those, or other related classes. 61 62 class NCBI_XOBJREAD_EXPORT CGeneInfoException : public CException 63 { 64 public: 65 /// Error types for Gene Information processing. 66 enum EErrCode { 67 eInputError, //< Invalid user input 68 eNetworkError, //< Cannot access data via network 69 eDataFormatError, //< File format not recognized 70 eFileNotFoundError, //< File not found 71 eMemoryError, //< Not enough memory 72 eInternalError //< Internal/algorithmic error 73 }; 74 75 /// Translate from the error code value to its string representation. GetErrCodeString(void) const76 virtual const char* GetErrCodeString(void) const override 77 { 78 switch (GetErrCode()) 79 { 80 case eInputError: return "eInputError"; 81 case eNetworkError: return "eNetworkError"; 82 case eDataFormatError: return "eDataFormatError"; 83 case eFileNotFoundError: return "eFileNotFoundError"; 84 case eMemoryError: return "eMemoryError"; 85 case eInternalError: return "eInternalError"; 86 } 87 return CException::GetErrCodeString(); 88 } 89 90 /// Standard exception boilerplate code. 
91 NCBI_EXCEPTION_DEFAULT(CGeneInfoException, CException); 92 }; 93 94 95 //==========================================================================// 96 97 /// CGeneInfo 98 /// 99 /// Gene information storage and formatted output. 100 /// 101 /// CGeneInfo is used to store and format Gene information. It contains 102 /// several basic fields from the Entrez Gene database, such as Gene 103 /// symbol, description, unique ID, etc. The class is derived from CObject 104 /// so that one can freely use CRefs with this class. 105 106 class NCBI_XOBJREAD_EXPORT CGeneInfo : public CObject 107 { 108 private: 109 /// Is the object properly initialized. 110 bool m_bIsInitialized; 111 112 /// Numeric unique Gene ID. 113 int m_nGeneId; 114 115 /// Official symbol of the Gene entry. 116 string m_strSymbol; 117 118 /// Description of the Gene. 119 string m_strDescription; 120 121 /// Scientific name of the organism (e.g. Sus scrofa). 122 string m_strOrgname; 123 124 /// Number of PubMed links for this entry. 125 int m_nPubMedLinks; 126 127 private: 128 /// Appends strSrc to strDest. 129 /// 130 /// The function makes sure that no single line 131 /// exceeds the maximum effective line length 132 /// (which is the actual number of characters 133 /// seen by the user, excluding HTML tags). 134 /// 135 /// @param strDest 136 /// Destination string to write to. 137 /// @param nCurLineEffLength 138 /// Length of the current line, the function 139 /// updates this variable as necessary. 140 /// @param strSrc 141 /// Source string to copy the characters from. 142 /// @param nSrcEffLength 143 /// Effective length of the source string, 144 /// excluding the HTML formatting tags, etc. 145 /// @param nMaxLineLength 146 /// Maximum allowed effective length for a line. 147 static void x_Append(string& strDest, 148 unsigned int& nCurLineEffLength, 149 const string& strSrc, 150 unsigned int nSrcEffLength, 151 unsigned int nMaxLineLength); 152 153 public: 154 /// Default constructor. 
155 /// 156 /// This version of the constructor makes a default, 157 /// uninitialized Gene information object. 158 CGeneInfo(); 159 160 /// Constructor for initializing Gene information. 161 /// 162 /// This version of the constructor makes a fully initialized 163 /// Gene information object. 164 /// 165 /// @param nGeneId 166 /// Unique integer ID of the Gene entry. 167 /// @param strSymbol 168 /// Official symbol of the Gene entry. 169 /// @param strDescription 170 /// Description (full name) of the Gene entry. 171 /// @param strOrgName 172 /// Scientific name of the organism. 173 /// @param nPubMedLinks 174 /// Number (or estimate) of related PubMed links. 175 CGeneInfo(int nGeneId, 176 const string& strSymbol, 177 const string& strDescription, 178 const string& strOrgName, 179 int nPubMedLinks); 180 181 /// Destructor. 182 virtual ~CGeneInfo(); 183 184 /// Check if the object has been properly initialized. IsInitialized() const185 bool IsInitialized() const {return m_bIsInitialized;} 186 187 /// Get the numeric unique Gene ID. GetGeneId() const188 int GetGeneId() const {return m_nGeneId;} 189 190 /// Get the official symbol of the Gene entry. GetSymbol() const191 const string& GetSymbol() const {return m_strSymbol;} 192 193 /// Get the description of the Gene entry. GetDescription() const194 const string& GetDescription() const {return m_strDescription;} 195 196 /// Get the scientific name of the organism. GetOrganismName() const197 const string& GetOrganismName() const {return m_strOrgname;} 198 199 /// Get the number of PubMed links for this entry. GetNumPubMedLinks() const200 int GetNumPubMedLinks() const {return m_nPubMedLinks;} 201 202 /// Format the Gene information as a multiline string. 203 /// 204 /// This function combines all the Gene information in one string, 205 /// forming one or more lines not exceeding nMaxLineLength, 206 /// and adds several HTML elements, if requested. 
207 /// 208 /// @param strGeneInfo 209 /// Destination string to fill with the Gene information. 210 /// @param bFormatAsHTML 211 /// This flag enables HTML formatting of the string, 212 /// which includes links to the actual Entrez Gene entry, 213 /// span tags for CSS processing, and so on. 214 /// @param nMaxLineLength 215 /// Maximum allowed effective length for a line (this excludes 216 /// HTML elements invisible to the user). If set to 0, 217 /// the function will use a reasonable default value. 218 void ToString(string& strGeneInfo, 219 bool bFormatAsHTML = false, 220 const string& strGeneLinkURL = "", 221 unsigned int nMaxLineLength = 0) const; 222 }; 223 224 /// Output the Gene information formatted as HTML. 225 NCBI_XOBJREAD_EXPORT 226 CNcbiOstream& operator<<(CNcbiOstream& out, const CGeneInfo& geneInfo); 227 228 229 //==========================================================================// 230 231 /// IGeneInfoInput 232 /// 233 /// Gene information retrieval interface. 234 /// 235 /// IGeneInfoInput defines the interface for obtaining Gene information 236 /// objects for a given Gi or a given Gene ID from any input source. 237 /// Additionally, the interface defines Gi to/from Gene ID conversions. 238 239 class NCBI_XOBJREAD_EXPORT IGeneInfoInput 240 { 241 public: 242 /// List of Gis. 243 typedef list<TGi> TGiList; 244 245 /// List of Gene IDs. 246 typedef list<int> TGeneIdList; 247 248 /// Gene ID to Gene Information map. 249 typedef map< int, CRef<CGeneInfo> > TGeneIdToGeneInfoMap; 250 251 /// List of Gene Information objects. 252 typedef vector< CRef<CGeneInfo> > TGeneInfoList; 253 254 public: 255 /// Destructor. ~IGeneInfoInput()256 virtual ~IGeneInfoInput() {} 257 258 /// Get all Gene IDs for a given Gi. 259 /// 260 /// Function takes a Gi and appends all available Gene IDs 261 /// for that Gi to the Gene ID list. Notice that some Gis 262 /// may be deliberately left out of the lookup process. 
263 /// 264 /// @param gi 265 /// The Gi to look up. 266 /// @param geneIdList 267 /// The Gene ID list to append to. 268 /// @return 269 /// True if one or more Gene IDs were found for the Gi. 270 virtual bool 271 GetGeneIdsForGi(TGi gi, TGeneIdList& geneIdList) = 0; 272 273 /// Get all RNA Gis for a given Gene ID. 274 /// 275 /// Function takes a Gene ID and appends all available RNA Gis 276 /// for that Gene ID to the Gi list. 277 /// 278 /// @param geneId 279 /// The Gene ID to look up. 280 /// @param giList 281 /// The Gi list to append to. 282 /// @return 283 /// True if one or more Gis were found for the Gene ID. 284 virtual bool 285 GetRNAGisForGeneId(int geneId, TGiList& giList) = 0; 286 287 /// Get all Protein Gis for a given Gene ID. 288 /// 289 /// Function takes a Gene ID and appends all available Protein Gis 290 /// for that Gene ID to the Gi list. 291 /// 292 /// @param geneId 293 /// The Gene ID to look up. 294 /// @param giList 295 /// The Gi list to append to. 296 /// @return 297 /// True if one or more Gis were found for the Gene ID. 298 virtual bool 299 GetProteinGisForGeneId(int geneId, TGiList& giList) = 0; 300 301 /// Get all Genomic Gis for a given Gene ID. 302 /// 303 /// Function takes a Gene ID and appends all available Genomic Gis 304 /// for that Gene ID to the Gi list. 305 /// 306 /// @param geneId 307 /// The Gene ID to look up. 308 /// @param giList 309 /// The Gi list to append to. 310 /// @return 311 /// True if one or more Gis were found for the Gene ID. 312 virtual bool 313 GetGenomicGisForGeneId(int geneId, TGiList& giList) = 0; 314 315 /// Get all Gene Information objects for a given Gi. 316 /// 317 /// Function takes a Gi, looks it up and appends all available 318 /// Gene information objects to the given list. Notice that some Gis 319 /// may be deliberately left out of the lookup process. 320 /// 321 /// @param gi 322 /// The Gi to look up. 323 /// @param infoList 324 /// The Gene information list to append to. 
325 /// @return 326 /// True if any Gene information was found for the Gi. 327 virtual bool 328 GetGeneInfoForGi(TGi gi, TGeneInfoList& infoList) = 0; 329 330 /// Get all Gene Information objects for a given Gene ID. 331 /// 332 /// Function takes a Gene ID, looks it up and appends all available 333 /// Gene information objects to the given list. 334 /// 335 /// @param geneId 336 /// The Gene ID to look up. 337 /// @param infoList 338 /// The Gene information list to append to. 339 /// @return 340 /// True if any Gene information was found for the Gene ID. 341 virtual bool 342 GetGeneInfoForId(int geneId, TGeneInfoList& infoList) = 0; 343 }; 344 345 //==========================================================================// 346 347 348 END_NCBI_SCOPE 349 350 #endif 351 352