1 #ifndef FEATURE_INDEXER__HPP
2 #define FEATURE_INDEXER__HPP
3 
4 /*
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author:  Jonathan Kans
30 *
31 */
32 
33 #include <corelib/ncbicntr.hpp>
34 
35 #include <objects/general/Object_id.hpp>
36 #include <objects/seq/MolInfo.hpp>
37 #include <objects/seq/Seq_descr.hpp>
38 #include <objects/seq/Seq_gap.hpp>
39 #include <objects/seqfeat/BioSource.hpp>
40 #include <objects/submit/Seq_submit.hpp>
41 #include <objects/submit/Submit_block.hpp>
42 
43 #include <objmgr/object_manager.hpp>
44 #include <objmgr/seq_entry_handle.hpp>
45 #include <objmgr/seq_vector.hpp>
46 #include <objmgr/util/feature.hpp>
47 
48 BEGIN_NCBI_SCOPE
49 BEGIN_SCOPE(objects)
50 
51 
52 // look-ahead class names
53 class CSeqEntryIndex;
54 class CSeqMasterIndex;
55 class CSeqsetIndex;
56 class CBioseqIndex;
57 class CGapIndex;
58 class CDescriptorIndex;
59 class CFeatureIndex;
60 
61 typedef void (*FAddSnpFunc)(CBioseq_Handle bsh, string& na_acc);
62 
63 // CSeqEntryIndex
64 //
65 // CSeqEntryIndex is the public, top-level Seq-entry exploration organizer.  A variable
// is created using the top-level sequence object, with the constructors taking optional
// fetch policy and feature collection flags.  An optional feature exploration depth (for
// the default adaptive fetch policy) can then be set with SetFeatDepth.  For example:
69 //
70 //   CSeqEntryIndex idx(*m_entry, CSeqEntryIndex::eAdaptive);
71 //
72 // A Seq-entry wrapper is created if the top-level object is a Bioseq or Bioseq-set.
73 // Bioseqs within the Seq-entry are then indexed and added to a vector of CBioseqIndex.
74 //
75 // Bioseqs are explored with IterateBioseqs, or selected individually by GetBioseqIndex
76 // (given an accession, index number, or subregion):
77 //
//   idx.IterateBioseqs([this](CBioseqIndex& bsx) {
79 //       ...
80 //   });
81 //
82 // The embedded lambda function statements are executed for each selected Bioseq.
83 //
// Internal indexing objects (i.e., CSeqMasterIndex, CSeqsetIndex, CBioseqIndex, CGapIndex,
// CDescriptorIndex, and CFeatureIndex) are generated by the indexing process, and
// should not be created by the application.
87 class NCBI_XOBJUTIL_EXPORT CSeqEntryIndex : public CObjectEx
88 {
89 public:
90 
91     enum EPolicy {
92         // far feature fetch policy
93         eAdaptive = 0,
94         eInternal = 1,
95         eExternal = 2,
96         eExhaustive = 3,
97         eFtp = 4,
98         eWeb = 5
99     };
100 
101     enum EFlags {
102         fDefault =           0,
103         fHideImpFeats =      1,
104         fHideSNPFeats =      2,
105         fHideCDDFeats =      4,
106         fHideSTSFeats =      8,
107         fHideExonFeats =    16,
108         fHideIntronFeats =  32,
109         fHideMiscFeats =    64,
110         fShowSNPFeats =    128,
111         fShowCDDFeats =    256,
112         fGeneRNACDSOnly =  512,
113         fHideGapFeats =   1024
114     };
115     typedef int TFlags; // Binary "OR" of EFlags
116 
117 public:
118     // Constructors take the top-level sequence object
119 
120     // The primary constructor uses an existing CScope created by the application
121     CSeqEntryIndex (CSeq_entry_Handle& topseh, EPolicy policy = eAdaptive, TFlags flags = fDefault);
122     CSeqEntryIndex (CBioseq_Handle& bsh, EPolicy policy = eAdaptive, TFlags flags = fDefault);
123 
124     // Alternative constructors take an object and create a new local default CScope
125     CSeqEntryIndex (CSeq_entry& topsep, EPolicy policy = eAdaptive, TFlags flags = fDefault);
126     CSeqEntryIndex (CBioseq_set& seqset, EPolicy policy = eAdaptive, TFlags flags = fDefault);
127     CSeqEntryIndex (CBioseq& bioseq, EPolicy policy = eAdaptive, TFlags flags = fDefault);
128     CSeqEntryIndex (CSeq_submit& submit, EPolicy policy = eAdaptive, TFlags flags = fDefault);
129 
130     // Specialized constructors are for streaming through release files, one component at a time
131 
132     // Submit-block obtained from top of Seq-submit release file
133     CSeqEntryIndex (CSeq_entry& topsep, CSubmit_block &sblock, EPolicy policy = eAdaptive, TFlags flags = fDefault);
134     // Seq-descr chain obtained from top of Bioseq-set release file
135     CSeqEntryIndex (CSeq_entry& topsep, CSeq_descr &descr, EPolicy policy = eAdaptive, TFlags flags = fDefault);
136 
137 private:
138     // Prohibit copy constructor & assignment operator
139     CSeqEntryIndex (const CSeqEntryIndex&) = delete;
140     CSeqEntryIndex& operator= (const CSeqEntryIndex&) = delete;
141 
142 public:
143     // Bioseq exploration iterator
144     template<typename Fnc> size_t IterateBioseqs (Fnc m);
145 
146     // GetBioseqIndex methods are provided for a variety of argument types
147 
148     // Get first Bioseq index
149     CRef<CBioseqIndex> GetBioseqIndex (void);
150     // Get Nth Bioseq index
151     CRef<CBioseqIndex> GetBioseqIndex (int n);
152     // Get Bioseq index by accession
153     CRef<CBioseqIndex> GetBioseqIndex (const string& accn);
154     // Get Bioseq index by handle
155     CRef<CBioseqIndex> GetBioseqIndex (CBioseq_Handle bsh);
156     // Get Bioseq index by mapped feature
157     CRef<CBioseqIndex> GetBioseqIndex (const CMappedFeat& mf);
158     // Get Bioseq index by sublocation
159     CRef<CBioseqIndex> GetBioseqIndex (const CSeq_loc& loc);
160 
161     // Seqset exploration iterator
162     template<typename Fnc> size_t IterateSeqsets (Fnc m);
163 
164     const vector<CRef<CBioseqIndex>>& GetBioseqIndices(void);
165 
166     const vector<CRef<CSeqsetIndex>>& GetSeqsetIndices(void);
167 
168     bool DistributedReferences(void);
169 
170     void SetSnpFunc(FAddSnpFunc* snp);
171 
172     FAddSnpFunc* GetSnpFunc(void);
173 
174     void SetFeatDepth(int featDepth);
175 
176     int GetFeatDepth(void);
177 
178     void SetGapDepth(int gapDepth);
179 
180     int GetGapDepth(void);
181 
182     // Check all Bioseqs for failure to fetch remote sequence components or feature annotation
183     bool IsFetchFailure(void);
184 
185     // Check for failure to create scope
186     bool IsIndexFailure (void);
187 
GetMasterIndex(void) const188     CRef<CSeqMasterIndex> GetMasterIndex(void) const { return m_Idx; }
189 
190 private:
191     // Implementation details are in a separate CSeqMasterIndex object wrapped in a CRef
192     CRef<CSeqMasterIndex> m_Idx;
193 };
194 
195 
196 // CSeqMasterIndex
197 //
198 // CSeqMasterIndex holds the implementation methods and variables for the CSeqEntryIndex
199 class NCBI_XOBJUTIL_EXPORT CSeqMasterIndex : public CObjectEx
200 {
201 public:
202     // Constructor is separate from Initializers so that CSeqEntryIndex can capture a CRef to
203     // its CSeqMasterIndex, making CWeakRef<CSeqMasterIndex> available to GetFeatureForProduct
CSeqMasterIndex(void)204     CSeqMasterIndex (void) { }
205 
206 public:
207     // Initializers take the top-level sequence object
208     void x_Initialize (CSeq_entry_Handle& topseh, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
209     void x_Initialize (CBioseq_Handle& bsh, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
210 
211     void x_Initialize (CSeq_entry& topsep, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
212     void x_Initialize (CBioseq_set& seqset, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
213     void x_Initialize (CBioseq& bioseq, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
214     void x_Initialize (CSeq_submit& submit, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
215 
216     void x_Initialize (CSeq_entry& topsep, CSubmit_block &sblock, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
217     void x_Initialize (CSeq_entry& topsep, CSeq_descr &descr, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
218 
219 private:
220     // Prohibit copy constructor & assignment operator
221     CSeqMasterIndex (const CSeqMasterIndex&) = delete;
222     CSeqMasterIndex& operator= (const CSeqMasterIndex&) = delete;
223 
224 public:
225     // Bioseq exploration iterator
226     template<typename Fnc> size_t IterateBioseqs (Fnc m);
227 
228     // Get first Bioseq index
229     CRef<CBioseqIndex> GetBioseqIndex (void);
230     // Get Nth Bioseq index
231     CRef<CBioseqIndex> GetBioseqIndex (int n);
232     // Get Bioseq index by accession
233     CRef<CBioseqIndex> GetBioseqIndex (const string& accn);
234     // Get Bioseq index by handle
235     CRef<CBioseqIndex> GetBioseqIndex (CBioseq_Handle bsh);
236     // Get Bioseq index by string
237     CRef<CBioseqIndex> GetBioseqIndex (string& str);
238     // Get Bioseq index by feature
239     CRef<CBioseqIndex> GetBioseqIndex (const CMappedFeat& mf);
240     // Get Bioseq index by sublocation
241     CRef<CBioseqIndex> GetBioseqIndex (const CSeq_loc& loc);
242 
243     // Seqset exploration iterator
244     template<typename Fnc> size_t IterateSeqsets (Fnc m);
245 
246     // Getters
GetObjectManager(void) const247     CRef<CObjectManager> GetObjectManager (void) const { return m_Objmgr; }
GetScope(void) const248     CRef<CScope> GetScope (void) const { return m_Scope; }
GetTopSEH(void) const249     CSeq_entry_Handle GetTopSEH (void) const { return m_Tseh; }
GetTopSEP(void) const250     CConstRef<CSeq_entry> GetTopSEP (void) const { return m_Tsep; }
GetSbtBlk(void) const251     CConstRef<CSubmit_block> GetSbtBlk (void) const { return m_SbtBlk; }
GetTopDescr(void) const252     CConstRef<CSeq_descr> GetTopDescr (void) const { return m_TopDescr; }
GetFeatTree(void)253     CRef<feature::CFeatTree> GetFeatTree (void) { return m_FeatTree; }
254 
255     const vector<CRef<CBioseqIndex>>& GetBioseqIndices(void);
256 
257     const vector<CRef<CSeqsetIndex>>& GetSeqsetIndices(void);
258 
SetHasOperon(bool hasOp)259     void SetHasOperon (bool hasOp) { m_HasOperon = hasOp; }
HasOperon(void) const260     bool HasOperon (void) const { return m_HasOperon; }
261 
IsSmallGenomeSet(void) const262     bool IsSmallGenomeSet (void) const { return m_IsSmallGenomeSet; }
263 
DistributedReferences(void) const264     bool DistributedReferences (void) const { return m_DistributedReferences; }
265 
266     void SetSnpFunc(FAddSnpFunc* snp);
267 
268     FAddSnpFunc* GetSnpFunc(void);
269 
270     void SetFeatDepth(int featDepth);
271 
272     int GetFeatDepth(void);
273 
274     void SetGapDepth(int gapDepth);
275 
276     int GetGapDepth(void);
277 
278     // Check all Bioseqs for failure to fetch remote sequence components or remote feature annotation
279     bool IsFetchFailure(void);
280 
281     // Check for failure to create scope
IsIndexFailure(void) const282     bool IsIndexFailure (void) const { return m_IndexFailure; }
SetIndexFailure(bool fails)283     void SetIndexFailure (bool fails) { m_IndexFailure = fails; }
284 
285 private:
286     // Common initialization function called by each Initialize variant
287     void x_Init (void);
288 
289     // Recursive exploration to populate vector of index objects for Bioseqs in Seq-entry
290     void x_InitSeqs (const CSeq_entry& sep, CRef<CSeqsetIndex> prnt, int level = 0);
291 
292 private:
293     CRef<CObjectManager> m_Objmgr;
294     CRef<CScope> m_Scope;
295     CSeq_entry_Handle m_Tseh;
296 
297     CConstRef<CSeq_entry> m_Tsep;
298     CConstRef<CSubmit_block> m_SbtBlk;
299     CConstRef<CSeq_descr> m_TopDescr;
300     CRef<feature::CFeatTree> m_FeatTree;
301 
302     CSeqEntryIndex::EPolicy m_Policy;
303     CSeqEntryIndex::TFlags m_Flags;
304 
305     vector<CRef<CBioseqIndex>> m_BsxList;
306 
307     // map from accession string to CBioseqIndex object
308     typedef map<string, CRef<CBioseqIndex> > TAccnIndexMap;
309     TAccnIndexMap m_AccnIndexMap;
310 
311     // map from CBioseq_Handle to CBioseqIndex object via best Seq-id string
312     typedef map<string, CRef<CBioseqIndex> > TBestIdIndexMap;
313     TBestIdIndexMap m_BestIdIndexMap;
314 
315     vector<CRef<CSeqsetIndex>> m_SsxList;
316 
317     bool m_HasOperon;
318     bool m_IsSmallGenomeSet;
319 
320     bool m_DistributedReferences;
321 
322     FAddSnpFunc* m_SnpFunc;
323 
324     int m_FeatDepth;
325     int m_GapDepth;
326 
327     mutable CAtomicCounter m_Counter;
328 
329     bool m_IndexFailure;
330 };
331 
332 
333 // CSeqsetIndex
334 //
335 // CSeqsetIndex stores information about an element in the Bioseq-set hierarchy
336 class NCBI_XOBJUTIL_EXPORT CSeqsetIndex : public CObjectEx
337 {
338 public:
339     // Constructor
340     CSeqsetIndex (CBioseq_set_Handle ssh,
341                   const CBioseq_set& bssp,
342                   CRef<CSeqsetIndex> prnt);
343 
344 private:
345     // Prohibit copy constructor & assignment operator
346     CSeqsetIndex (const CSeqsetIndex&) = delete;
347     CSeqsetIndex& operator= (const CSeqsetIndex&) = delete;
348 
349 public:
350     // Getters
GetSeqsetHandle(void) const351     CBioseq_set_Handle GetSeqsetHandle (void) const { return m_Ssh; }
GetSeqset(void) const352     const CBioseq_set& GetSeqset (void) const { return m_Bssp; }
GetParent(void) const353     CRef<CSeqsetIndex> GetParent (void) const { return m_Prnt; }
354 
GetClass(void) const355     CBioseq_set::TClass GetClass (void) const { return m_Class; }
356 
357 private:
358     CBioseq_set_Handle m_Ssh;
359     const CBioseq_set& m_Bssp;
360     CRef<CSeqsetIndex> m_Prnt;
361 
362     CBioseq_set::TClass m_Class;
363 };
364 
365 
366 // CBioseqIndex
367 //
368 // CBioseqIndex is the exploration organizer for a given Bioseq.  It provides methods to
369 // obtain descriptors and iterate through features that apply to the Bioseq.  (These are
370 // stored in vectors, which are initialized upon first request.)
371 //
372 // CBioseqIndex also maintains a CFeatTree for its Bioseq, used to find the best gene for
373 // each feature.
374 //
375 // Descriptors are explored with:
376 //
377 //   bsx.IterateDescriptors([this](CDescriptorIndex& sdx) {
378 //       ...
379 //   });
380 //
381 // and are presented based on the order of the descriptor chain hierarchy, starting with
382 // descriptors packaged on the Bioseq, then on its parent Bioseq-set, etc.
383 //
384 // Features are explored with:
385 //
386 //   bsx.IterateFeatures([this](CFeatureIndex& sfx) {
387 //       ...
388 //   });
389 //
390 // and are presented in order of biological position along the parent sequence.
391 //
392 // Fetching external features uses SAnnotSelector adaptive depth unless explicitly overridden.
393 class NCBI_XOBJUTIL_EXPORT CBioseqIndex : public CObjectEx
394 {
395 public:
396     // Constructor
397     CBioseqIndex (CBioseq_Handle bsh,
398                   const CBioseq& bsp,
399                   CBioseq_Handle obsh,
400                   CRef<CSeqsetIndex> prnt,
401                   CSeq_entry_Handle tseh,
402                   CRef<CScope> scope,
403                   CSeqMasterIndex& idx,
404                   CSeqEntryIndex::EPolicy policy,
405                   CSeqEntryIndex::TFlags flags);
406 
407     // Destructor
408     ~CBioseqIndex (void);
409 
410 private:
411     // Prohibit copy constructor & assignment operator
412     CBioseqIndex (const CBioseqIndex&) = delete;
413     CBioseqIndex& operator= (const CBioseqIndex&) = delete;
414 
415 public:
416     // Gap exploration iterator
417     template<typename Fnc> size_t IterateGaps (Fnc m);
418 
419     // Descriptor exploration iterator
420     template<typename Fnc> size_t IterateDescriptors (Fnc m);
421 
422     // Feature exploration iterator
423     template<typename Fnc> size_t IterateFeatures (Fnc m);
424     template<typename Fnc> size_t IterateFeatures (CSeq_loc& slp, Fnc m);
425 
426     // Getters
GetBioseqHandle(void) const427     CBioseq_Handle GetBioseqHandle (void) const { return m_Bsh; }
GetBioseq(void) const428     const CBioseq& GetBioseq (void) const { return m_Bsp; }
GetOrigBioseqHandle(void) const429     CBioseq_Handle GetOrigBioseqHandle (void) const { return m_OrigBsh; }
GetParent(void) const430     CRef<CSeqsetIndex> GetParent (void) const { return m_Prnt; }
GetScope(void) const431     CRef<CScope> GetScope (void) const { return m_Scope; }
GetSeqVector(void) const432     CRef<CSeqVector> GetSeqVector (void) const { return m_SeqVec; }
433 
434     // Get master index
GetSeqMasterIndex(void) const435     CWeakRef<CSeqMasterIndex> GetSeqMasterIndex (void) const { return m_Idx; }
436 
437     // Get sequence letters from Bioseq
438     string GetSequence (void);
439     void GetSequence (string& buffer);
440     // Get sequence letters from Bioseq subrange
441     string GetSequence (int from, int to);
442     void GetSequence (int from, int to, string& buffer);
443 
444     // Map from GetBestGene result to CFeatureIndex object
445     CRef<CFeatureIndex> GetFeatIndex (const CMappedFeat& mf);
446 
447     const vector<CRef<CGapIndex>>& GetGapIndices(void);
448 
449     const vector<CRef<CDescriptorIndex>>& GetDescriptorIndices(void);
450 
451     const vector<CRef<CFeatureIndex>>& GetFeatureIndices(void);
452 
453     // Get feature (CDS, mRNA, Prot) with product pointing to this Bioseq (protein, cDNA, peptide)
454     CRef<CFeatureIndex> GetFeatureForProduct(void);
455 
456     // Get Bioseq index containing feature with product pointing to this Bioseq
457     CWeakRef<CBioseqIndex> GetBioseqForProduct (void);
458 
459     // Get best (longest) protein feature on this protein Bioseq
460     CRef<CFeatureIndex> GetBestProteinFeature(void);
461 
462     // Flag to indicate failure to fetch remote sequence components or feature annotation
IsFetchFailure(void) const463     bool IsFetchFailure (void) const { return m_FetchFailure; }
464 
SetFetchFailure(bool fails)465     void SetFetchFailure (bool fails) { m_FetchFailure = fails; }
466 
467 public:
468     // Seq-inst fields
IsNA(void) const469     bool IsNA (void) const {  return m_IsNA; }
IsAA(void) const470     bool IsAA (void) const { return m_IsAA; }
GetTopology(void) const471     CSeq_inst::TTopology GetTopology (void) const { return m_Topology; }
GetLength(void) const472     CSeq_inst::TLength GetLength (void) const { return m_Length; }
473 
IsDelta(void) const474     bool IsDelta (void) const { return m_IsDelta; }
IsDeltaLitOnly(void) const475     bool IsDeltaLitOnly (void) const { return m_IsDeltaLitOnly; }
IsVirtual(void) const476     bool IsVirtual (void) const { return m_IsVirtual; }
IsMap(void) const477     bool IsMap (void) const { return m_IsMap; }
478 
479     // Seq-id fields
GetAccession(void) const480     const string& GetAccession (void) const { return m_Accession; }
481 
IsRefSeq(void) const482     bool IsRefSeq (void) const { return m_IsRefSeq; }
IsNC(void) const483     bool IsNC (void) const { return m_IsNC; }
IsNM(void) const484     bool IsNM (void) const { return m_IsNM; }
IsNR(void) const485     bool IsNR (void) const { return m_IsNR; }
IsNZ(void) const486     bool IsNZ (void) const { return m_IsNZ; }
IsPatent(void) const487     bool IsPatent (void) const { return m_IsPatent; }
IsPDB(void) const488     bool IsPDB (void) const { return m_IsPDB; }
IsWP(void) const489     bool IsWP (void) const { return m_IsWP; }
IsThirdParty(void) const490     bool IsThirdParty (void) const { return m_ThirdParty; }
IsWGSMaster(void) const491     bool IsWGSMaster (void) const { return m_WGSMaster; }
IsTSAMaster(void) const492     bool IsTSAMaster (void) const { return m_TSAMaster; }
IsTLSMaster(void) const493     bool IsTLSMaster (void) const { return m_TLSMaster; }
494 
GetGeneralStr(void) const495     string GetGeneralStr (void) const { return m_GeneralStr; }
GetGeneralId(void) const496     int GetGeneralId (void) const { return m_GeneralId; }
497 
GetPatentCountry(void) const498     string GetPatentCountry (void) const { return m_PatentCountry; }
GetPatentNumber(void) const499     string GetPatentNumber (void) const { return m_PatentNumber; }
GetPatentSequence(void) const500     int GetPatentSequence (void) const { return m_PatentSequence; }
501 
GetPDBChain(void) const502     int GetPDBChain (void) const { return m_PDBChain; }
GetPDBChainID(void) const503     string GetPDBChainID (void) const { return m_PDBChainID; }
504 
505     // Most important descriptor fields
506 
507     const string& GetTitle (void);
508 
509     CConstRef<CMolInfo> GetMolInfo (void);
510     CMolInfo::TBiomol GetBiomol (void);
511     CMolInfo::TTech GetTech (void);
512     CMolInfo::TCompleteness GetCompleteness (void);
513 
514     CConstRef<CBioSource> GetBioSource (void);
515     const string& GetTaxname (void);
516 
517     const string& GetDescTaxname (void);
518 
519     bool IsHTGTech (void);
520     bool IsHTGSUnfinished (void);
521     bool IsTLS (void);
522     bool IsTSA (void);
523     bool IsWGS (void);
524     bool IsEST_STS_GSS (void);
525 
526     bool IsUseBiosrc (void);
527 
528     const string& GetCommon (void);
529     const string& GetLineage (void);
530     TTaxId GetTaxid (void);
531     bool IsUsingAnamorph (void);
532 
533     CTempString GetGenus (void);
534     CTempString GetSpecies (void);
535     bool IsMultispecies (void);
536     CBioSource::TGenome GetGenome (void);
537     bool IsPlasmid (void);
538     bool IsChromosome (void);
539 
540     const string& GetOrganelle (void);
541 
542     string GetFirstSuperKingdom (void);
543     string GetSecondSuperKingdom (void);
544     bool IsCrossKingdom (void);
545 
546     CTempString GetChromosome (void);
547     CTempString GetLinkageGroup (void);
548     CTempString GetClone (void);
549     bool HasClone (void);
550     CTempString GetMap (void);
551     CTempString GetPlasmid (void);
552     CTempString GetSegment (void);
553 
554     CTempString GetBreed (void);
555     CTempString GetCultivar (void);
556     CTempString GetIsolate (void);
557     CTempString GetStrain (void);
558     CTempString GetSubstrain (void);
559     CTempString GetMetaGenomeSource (void);
560 
561     bool IsHTGSCancelled (void);
562     bool IsHTGSDraft (void);
563     bool IsHTGSPooled (void);
564     bool IsTPAExp (void);
565     bool IsTPAInf (void);
566     bool IsTPAReasm (void);
567     bool IsUnordered (void);
568 
569     CTempString GetPDBCompound (void);
570 
571     bool IsForceOnlyNearFeats (void);
572 
573     bool IsUnverified (void);
574     bool IsUnverifiedFeature (void);
575     bool IsUnverifiedOrganism (void);
576     bool IsUnverifiedMisassembled (void);
577     bool IsUnverifiedContaminant (void);
578 
579     CTempString GetTargetedLocus (void);
580 
581     const string& GetComment (void);
582     bool IsPseudogene (void);
583 
584     bool HasOperon (void);
585     bool HasGene (void);
586     bool HasMultiIntervalGenes (void);
587     bool HasSource (void);
588 
589     string GetrEnzyme (void);
590 
591 private:
592     // Common gap collection, delayed until actually needed
593     void x_InitGaps (void);
594 
595     // Common descriptor collection, delayed until actually needed
596     void x_InitDescs (void);
597 
598     // Common feature collection, delayed until actually needed
599     void x_InitFeats (void);
600     void x_InitFeats (CSeq_loc& slp);
601 
602     void x_DefaultSelector(SAnnotSelector& sel, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, bool onlyNear, CScope& scope);
603 
604     // common implementation method
605     void x_InitFeats (CSeq_loc* slpp);
606 
607     // Set BioSource flags
608     void x_InitSource (void);
609 
610 private:
611     CBioseq_Handle m_Bsh;
612     const CBioseq& m_Bsp;
613     CBioseq_Handle m_OrigBsh;
614     CRef<CSeqsetIndex> m_Prnt;
615     CSeq_entry_Handle m_Tseh;
616     CRef<CScope> m_Scope;
617 
618     CWeakRef<CSeqMasterIndex> m_Idx;
619 
620     bool m_GapsInitialized;
621     vector<CRef<CGapIndex>> m_GapList;
622 
623     bool m_DescsInitialized;
624     vector<CRef<CDescriptorIndex>> m_SdxList;
625 
626     bool m_FeatsInitialized;
627     vector<CRef<CFeatureIndex>> m_SfxList;
628 
629     bool m_SourcesInitialized;
630 
631     bool m_FeatForProdInitialized;
632     CRef<CFeatureIndex> m_FeatureForProduct;
633 
634     bool m_BestProtFeatInitialized;
635     CRef<CFeatureIndex> m_BestProteinFeature;
636 
637     // CFeatureIndex from CMappedFeat for use with GetBestGene
638     typedef map<CMappedFeat, CRef<CFeatureIndex> > TFeatIndexMap;
639     TFeatIndexMap m_FeatIndexMap;
640 
641     CRef<CSeqVector> m_SeqVec;
642 
643     CSeqEntryIndex::EPolicy m_Policy;
644     CSeqEntryIndex::TFlags m_Flags;
645 
646     bool m_FetchFailure;
647 
648 private:
649     // Seq-inst fields
650     bool m_IsNA;
651     bool m_IsAA;
652     CSeq_inst::TTopology m_Topology;
653     CSeq_inst::TLength m_Length;
654 
655     bool m_IsDelta;
656     bool m_IsDeltaLitOnly;
657     bool m_IsVirtual;
658     bool m_IsMap;
659 
660     // Seq-id fields
661     string m_Accession;
662 
663     bool m_IsRefSeq;
664     bool m_IsNC;
665     bool m_IsNM;
666     bool m_IsNR;
667     bool m_IsNZ;
668     bool m_IsPatent;
669     bool m_IsPDB;
670     bool m_IsWP;
671     bool m_ThirdParty;
672     bool m_WGSMaster;
673     bool m_TSAMaster;
674     bool m_TLSMaster;
675 
676     string m_GeneralStr;
677     int m_GeneralId;
678 
679     string m_PatentCountry;
680     string m_PatentNumber;
681     int m_PatentSequence;
682 
683     int m_PDBChain;
684     string m_PDBChainID;
685 
686     // Instantiated title
687     string m_Title;
688 
689     // MolInfo fields
690     CConstRef<CMolInfo> m_MolInfo;
691     CMolInfo::TBiomol m_Biomol;
692     CMolInfo::TTech m_Tech;
693     CMolInfo::TCompleteness m_Completeness;
694 
695     bool m_HTGTech;
696     bool m_HTGSUnfinished;
697     bool m_IsTLS;
698     bool m_IsTSA;
699     bool m_IsWGS;
700     bool m_IsEST_STS_GSS;
701 
702     bool m_UseBiosrc;
703 
704     // BioSource fields
705     CConstRef<CBioSource> m_DescBioSource;
706     string m_DescTaxname;
707 
708     CConstRef<CBioSource> m_BioSource;
709     string m_Taxname;
710 
711     string m_Common;
712     string m_Lineage;
713     TTaxId m_Taxid;
714     bool m_UsingAnamorph;
715 
716     CTempString m_Genus;
717     CTempString m_Species;
718     bool m_Multispecies;
719     CBioSource::TGenome m_Genome;
720     bool m_IsPlasmid;
721     bool m_IsChromosome;
722 
723     string m_Organelle;
724 
725     string m_FirstSuperKingdom;
726     string m_SecondSuperKingdom;
727     bool m_IsCrossKingdom;
728 
729     // Subsource fields
730     CTempString m_Chromosome;
731     CTempString m_LinkageGroup;
732     CTempString m_Clone;
733     bool m_has_clone;
734     CTempString m_Map;
735     CTempString m_Plasmid;
736     CTempString m_Segment;
737 
738     // Orgmod fields
739     CTempString m_Breed;
740     CTempString m_Cultivar;
741     CTempString m_Isolate;
742     CTempString m_Strain;
743     CTempString m_Substrain;
744     CTempString m_MetaGenomeSource;
745 
746     // Keyword fields (genbank or embl blocks)
747     bool m_HTGSCancelled;
748     bool m_HTGSDraft;
749     bool m_HTGSPooled;
750     bool m_TPAExp;
751     bool m_TPAInf;
752     bool m_TPAReasm;
753     bool m_Unordered;
754 
755     // PDB block fields
756     CTempString m_PDBCompound;
757 
758     // User object fields
759     bool m_ForceOnlyNearFeats;
760 
761     bool m_IsUnverified;
762     bool m_IsUnverifiedFeature;
763     bool m_IsUnverifiedOrganism;
764     bool m_IsUnverifiedMisassembled;
765     bool m_IsUnverifiedContaminant;
766     CTempString m_UnverifiedPrefix;
767 
768     CTempString m_TargetedLocus;
769 
770     // Comment fields
771     string m_Comment;
772     bool m_IsPseudogene;
773 
774     // Feature fields
775     bool m_HasGene;
776     bool m_HasMultiIntervalGenes;
777     bool m_HasSource;
778 
779     // Map fields
780     string m_rEnzyme;
781 };
782 
783 
784 // CGapIndex
785 //
// CGapIndex stores information about an indexed sequence gap
787 class NCBI_XOBJUTIL_EXPORT CGapIndex : public CObject
788 {
789 public:
790     // Constructor
791     CGapIndex (TSeqPos start,
792                TSeqPos end,
793                TSeqPos length,
794                const string& type,
795                const vector<string>& evidence,
796                bool isUnknownLength,
797                bool isAssemblyGap,
798                CBioseqIndex& bsx);
799 
800 private:
801     // Prohibit copy constructor & assignment operator
802     CGapIndex (const CGapIndex&) = delete;
803     CGapIndex& operator= (const CGapIndex&) = delete;
804 
805 public:
806     // Getters
807 
GetStart(void) const808     TSeqPos GetStart (void) const { return m_Start; }
GetEnd(void) const809     TSeqPos GetEnd (void) const { return m_End; }
GetLength(void) const810     TSeqPos GetLength (void) const { return m_Length; }
GetGapType(void) const811     const string GetGapType (void) const { return m_GapType; }
GetGapEvidence(void) const812     const vector<string>& GetGapEvidence (void) const { return m_GapEvidence; }
IsUnknownLength(void) const813     bool IsUnknownLength (void) const { return m_IsUnknownLength; }
IsAssemblyGap(void) const814     bool IsAssemblyGap (void) const { return m_IsAssemblyGap; }
815 
816     // Get parent Bioseq index
GetBioseqIndex(void) const817     CWeakRef<CBioseqIndex> GetBioseqIndex (void) const { return m_Bsx; }
818 
819 private:
820     CWeakRef<CBioseqIndex> m_Bsx;
821 
822     TSeqPos m_Start;
823     TSeqPos m_End;
824     TSeqPos m_Length;
825 
826     string m_GapType;
827     vector<string> m_GapEvidence;
828 
829     bool m_IsUnknownLength;
830     bool m_IsAssemblyGap;
831 };
832 
833 
834 // CDescriptorIndex
835 //
836 // CDescriptorIndex stores information about an indexed descriptor
837 class NCBI_XOBJUTIL_EXPORT CDescriptorIndex : public CObject
838 {
839 public:
840     // Constructor
841     CDescriptorIndex (const CSeqdesc& sd,
842                       CBioseqIndex& bsx);
843 
844 private:
845     // Prohibit copy constructor & assignment operator
846     CDescriptorIndex (const CDescriptorIndex&) = delete;
847     CDescriptorIndex& operator= (const CDescriptorIndex&) = delete;
848 
849 public:
850     // Getters
GetSeqDesc(void) const851     const CSeqdesc& GetSeqDesc (void) const { return m_Sd; }
852 
853     // Get parent Bioseq index
GetBioseqIndex(void) const854     CWeakRef<CBioseqIndex> GetBioseqIndex (void) const { return m_Bsx; }
855 
856     // Get descriptor type (e.g., CSeqdesc::e_Molinfo)
GetType(void) const857     CSeqdesc::E_Choice GetType (void) const { return m_Type; }
858 
859 private:
860     const CSeqdesc& m_Sd;
861     CWeakRef<CBioseqIndex> m_Bsx;
862 
863     CSeqdesc::E_Choice m_Type;
864 };
865 
866 
867 // CFeatureIndex
868 //
869 // CFeatureIndex stores information about an indexed feature
870 class NCBI_XOBJUTIL_EXPORT CFeatureIndex : public CObject
871 {
872 public:
873     // Constructor
874     CFeatureIndex (CSeq_feat_Handle sfh,
875                    const CMappedFeat mf,
876                    CConstRef<CSeq_loc> feat_loc,
877                    CBioseqIndex& bsx);
878 
879 private:
880     // Prohibit copy constructor & assignment operator
881     CFeatureIndex (const CFeatureIndex&) = delete;
882     CFeatureIndex& operator= (const CFeatureIndex&) = delete;
883 
884 public:
885     // Getters
GetSeqFeatHandle(void) const886     CSeq_feat_Handle GetSeqFeatHandle (void) const { return m_Sfh; }
GetMappedFeat(void) const887     const CMappedFeat GetMappedFeat (void) const { return m_Mf; }
GetSeqVector(void) const888     CRef<CSeqVector> GetSeqVector (void) const { return m_SeqVec; }
889 
GetMappedLocation(void) const890     CConstRef<CSeq_loc> GetMappedLocation(void) const { return m_Fl; }
891 
892     // Get parent Bioseq index
GetBioseqIndex(void) const893     CWeakRef<CBioseqIndex> GetBioseqIndex (void) const { return m_Bsx; }
894 
895     // Get feature type (e.g. CSeqFeatData::e_Rna)
GetType(void) const896     CSeqFeatData::E_Choice GetType (void) const { return m_Type; }
897 
898     // Get feature subtype (e.g. CSeqFeatData::eSubtype_mRNA)
GetSubtype(void) const899     CSeqFeatData::ESubtype GetSubtype (void) const { return m_Subtype; }
900 
GetStart(void) const901     TSeqPos GetStart (void) const { return m_Start; }
GetEnd(void) const902     TSeqPos GetEnd (void) const { return m_End; }
903 
904     // Get sequence letters under feature intervals
905     string GetSequence (void);
906     void GetSequence (string& buffer);
907     // Get sequence letters under feature subrange
908     string GetSequence (int from, int to);
909     void GetSequence (int from, int to, string& buffer);
910 
911     // Map from feature to CFeatureIndex for best gene using CFeatTree in parent CBioseqIndex
912     CRef<CFeatureIndex> GetBestGene (void);
913 
914     // Map from feature to CFeatureIndex for best VDJC parent using CFeatTree in parent CBioseqIndex
915     CRef<CFeatureIndex> GetBestParent (void);
916 
917     // Find CFeatureIndex object for overlapping source feature using internal CFeatTree
918     CRef<CFeatureIndex> GetOverlappingSource (void);
919 
920 private:
921     void SetFetchFailure (bool fails);
922 
923 private:
924     CSeq_feat_Handle m_Sfh;
925     const CMappedFeat m_Mf;
926     CConstRef<CSeq_loc> m_Fl;
927     CRef<CSeqVector> m_SeqVec;
928     CWeakRef<CBioseqIndex> m_Bsx;
929 
930     CSeqFeatData::E_Choice m_Type;
931     CSeqFeatData::ESubtype m_Subtype;
932 
933     TSeqPos m_Start;
934     TSeqPos m_End;
935 };
936 
937 
938 // CWordPairIndexer
939 //
940 // CWordPairIndexer generates normalized terms and adjacent word pairs for Entrez indexing
941 class NCBI_XOBJUTIL_EXPORT CWordPairIndexer
942 {
943 public:
944     // Constructor
CWordPairIndexer(void)945     CWordPairIndexer (void) { }
946 
947 private:
948     // Prohibit copy constructor & assignment operator
949     CWordPairIndexer (const CWordPairIndexer&) = delete;
950     CWordPairIndexer& operator= (const CWordPairIndexer&) = delete;
951 
952 public:
953     void PopulateWordPairIndex (string str);
954 
955     template<typename Fnc> void IterateNorm (Fnc m);
956     template<typename Fnc> void IteratePair (Fnc m);
957 
958 public:
959     static string ConvertUTF8ToAscii(const string& str);
960     static string TrimPunctuation (const string& str);
961     static string TrimMixedContent (const string& str);
962     static bool IsStopWord(const string& str);
963 
GetNorm(void) const964     const vector<string>& GetNorm (void) const { return m_Norm; }
GetPair(void) const965     const vector<string>& GetPair (void) const { return m_Pair; }
966 
967 private:
968     string x_AddToWordPairIndex (string item, string prev);
969 
970     vector<string> m_Norm;
971     vector<string> m_Pair;
972 };
973 
974 
// Inline implementations of the templated iteration functions (the callback argument is typically a lambda)
976 
977 // Visit CBioseqIndex objects for all Bioseqs
978 template<typename Fnc>
979 inline
IterateBioseqs(Fnc m)980 size_t CSeqEntryIndex::IterateBioseqs (Fnc m)
981 
982 {
983     return m_Idx->IterateBioseqs(m);
984 }
985 
986 template<typename Fnc>
987 inline
IterateBioseqs(Fnc m)988 size_t CSeqMasterIndex::IterateBioseqs (Fnc m)
989 
990 {
991     int count = 0;
992     for (auto& bsx : m_BsxList) {
993         m(*bsx);
994         count++;
995     }
996     return count;
997 }
998 
999 // Visit CSeqsetIndex objects for all Seqsets
1000 template<typename Fnc>
1001 inline
IterateSeqsets(Fnc m)1002 size_t CSeqEntryIndex::IterateSeqsets (Fnc m)
1003 
1004 {
1005     return m_Idx->IterateSeqsets(m);
1006 }
1007 
1008 template<typename Fnc>
1009 inline
IterateSeqsets(Fnc m)1010 size_t CSeqMasterIndex::IterateSeqsets (Fnc m)
1011 
1012 {
1013     int count = 0;
1014     for (auto& ssx : m_SsxList) {
1015         m(*ssx);
1016         count++;
1017     }
1018     return count;
1019 }
1020 
1021 // Visit CGapIndex objects for all gaps
1022 template<typename Fnc>
1023 inline
IterateGaps(Fnc m)1024 size_t CBioseqIndex::IterateGaps (Fnc m)
1025 
1026 {
1027     int count = 0;
1028     try {
1029         // Delay gap collection until first request
1030         if (! m_GapsInitialized) {
1031             x_InitGaps();
1032         }
1033 
1034         for (auto& sgx : m_GapList) {
1035             count++;
1036             m(*sgx);
1037         }
1038     }
1039     catch (CException& e) {
1040         ERR_POST(Error << "Error in CBioseqIndex::IterateGaps: " << e.what());
1041     }
1042     return count;
1043 }
1044 
1045 // Visit CDescriptorIndex objects for all descriptors
1046 template<typename Fnc>
1047 inline
IterateDescriptors(Fnc m)1048 size_t CBioseqIndex::IterateDescriptors (Fnc m)
1049 
1050 {
1051     int count = 0;
1052     try {
1053         // Delay descriptor collection until first request
1054         if (! m_DescsInitialized) {
1055             x_InitDescs();
1056         }
1057 
1058         for (auto& sdx : m_SdxList) {
1059             count++;
1060             m(*sdx);
1061         }
1062     }
1063     catch (CException& e) {
1064         ERR_POST(Error << "Error in CBioseqIndex::IterateDescriptors: " << e.what());
1065     }
1066     return count;
1067 }
1068 
1069 // Visit CFeatureIndex objects for all features
1070 template<typename Fnc>
1071 inline
IterateFeatures(Fnc m)1072 size_t CBioseqIndex::IterateFeatures (Fnc m)
1073 
1074 {
1075     int count = 0;
1076     try {
1077         // Delay feature collection until first request
1078         if (! m_FeatsInitialized) {
1079             x_InitFeats();
1080         }
1081 
1082         for (auto& sfx : m_SfxList) {
1083             count++;
1084             m(*sfx);
1085         }
1086     }
1087     catch (CException& e) {
1088         ERR_POST(Error << "Error in CBioseqIndex::IterateFeatures: " << e.what());
1089     }
1090     return count;
1091 }
1092 
1093 template<typename Fnc>
1094 inline
IterateFeatures(CSeq_loc & slp,Fnc m)1095 size_t CBioseqIndex::IterateFeatures (CSeq_loc& slp, Fnc m)
1096 
1097 {
1098     int count = 0;
1099     try {
1100         // Delay feature collection until first request, but do not bail on m_FeatsInitialized flag
1101         x_InitFeats(slp);
1102 
1103         for (auto& sfx : m_SfxList) {
1104             count++;
1105             m(*sfx);
1106         }
1107     }
1108     catch (CException& e) {
1109         ERR_POST(Error << "Error in CBioseqIndex::IterateFeatures: " << e.what());
1110     }
1111     return count;
1112 }
1113 
1114 template<typename Fnc>
1115 inline
IterateNorm(Fnc m)1116 void CWordPairIndexer::IterateNorm (Fnc m)
1117 
1118 {
1119     for (auto& str : m_Norm) {
1120         m(str);
1121     }
1122 }
1123 
1124 template<typename Fnc>
1125 inline
IteratePair(Fnc m)1126 void CWordPairIndexer::IteratePair (Fnc m)
1127 
1128 {
1129     for (auto& str : m_Pair) {
1130         m(str);
1131     }
1132 }
1133 
1134 
1135 END_SCOPE(objects)
1136 END_NCBI_SCOPE
1137 
1138 #endif  /* FEATURE_INDEXER__HPP */
1139