1 #ifndef FEATURE_INDEXER__HPP
2 #define FEATURE_INDEXER__HPP
3
4 /*
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Jonathan Kans
30 *
31 */
32
33 #include <corelib/ncbicntr.hpp>
34
35 #include <objects/general/Object_id.hpp>
36 #include <objects/seq/MolInfo.hpp>
37 #include <objects/seq/Seq_descr.hpp>
38 #include <objects/seq/Seq_gap.hpp>
39 #include <objects/seqfeat/BioSource.hpp>
40 #include <objects/submit/Seq_submit.hpp>
41 #include <objects/submit/Submit_block.hpp>
42
43 #include <objmgr/object_manager.hpp>
44 #include <objmgr/seq_entry_handle.hpp>
45 #include <objmgr/seq_vector.hpp>
46 #include <objmgr/util/feature.hpp>
47
48 BEGIN_NCBI_SCOPE
49 BEGIN_SCOPE(objects)
50
51
52 // look-ahead class names
53 class CSeqEntryIndex;
54 class CSeqMasterIndex;
55 class CSeqsetIndex;
56 class CBioseqIndex;
57 class CGapIndex;
58 class CDescriptorIndex;
59 class CFeatureIndex;
60
61 typedef void (*FAddSnpFunc)(CBioseq_Handle bsh, string& na_acc);
62
63 // CSeqEntryIndex
64 //
65 // CSeqEntryIndex is the public, top-level Seq-entry exploration organizer. A variable
66 // is created using the top-level sequence object, with the constructors taking optional
67 // fetch policy and feature collection flags, as well as an optional feature exploration
68 // depth parameter (for the default adaptive fetch policy):
69 //
70 // CSeqEntryIndex idx(*m_entry, CSeqEntryIndex::eAdaptive);
71 //
72 // A Seq-entry wrapper is created if the top-level object is a Bioseq or Bioseq-set.
73 // Bioseqs within the Seq-entry are then indexed and added to a vector of CBioseqIndex.
74 //
75 // Bioseqs are explored with IterateBioseqs, or selected individually by GetBioseqIndex
76 // (given an accession, index number, or subregion):
77 //
78 // idx.IterateBioseqs("U54469", [this](CBioseqIndex& bsx) {
79 // ...
80 // });
81 //
82 // The embedded lambda function statements are executed for each selected Bioseq.
83 //
84 // Internal indexing objects (i.e., CSeqMasterIndex, CSeqsetIndex, CBioseqIndex,
85 // CDescriptorIndex, and CFeatureIndex) are generated by the indexing process, and
86 // should not be created by the application.
87 class NCBI_XOBJUTIL_EXPORT CSeqEntryIndex : public CObjectEx
88 {
89 public:
90
91 enum EPolicy {
92 // far feature fetch policy
93 eAdaptive = 0,
94 eInternal = 1,
95 eExternal = 2,
96 eExhaustive = 3,
97 eFtp = 4,
98 eWeb = 5
99 };
100
101 enum EFlags {
102 fDefault = 0,
103 fHideImpFeats = 1,
104 fHideSNPFeats = 2,
105 fHideCDDFeats = 4,
106 fHideSTSFeats = 8,
107 fHideExonFeats = 16,
108 fHideIntronFeats = 32,
109 fHideMiscFeats = 64,
110 fShowSNPFeats = 128,
111 fShowCDDFeats = 256,
112 fGeneRNACDSOnly = 512,
113 fHideGapFeats = 1024
114 };
115 typedef int TFlags; // Binary "OR" of EFlags
116
117 public:
118 // Constructors take the top-level sequence object
119
120 // The primary constructor uses an existing CScope created by the application
121 CSeqEntryIndex (CSeq_entry_Handle& topseh, EPolicy policy = eAdaptive, TFlags flags = fDefault);
122 CSeqEntryIndex (CBioseq_Handle& bsh, EPolicy policy = eAdaptive, TFlags flags = fDefault);
123
124 // Alternative constructors take an object and create a new local default CScope
125 CSeqEntryIndex (CSeq_entry& topsep, EPolicy policy = eAdaptive, TFlags flags = fDefault);
126 CSeqEntryIndex (CBioseq_set& seqset, EPolicy policy = eAdaptive, TFlags flags = fDefault);
127 CSeqEntryIndex (CBioseq& bioseq, EPolicy policy = eAdaptive, TFlags flags = fDefault);
128 CSeqEntryIndex (CSeq_submit& submit, EPolicy policy = eAdaptive, TFlags flags = fDefault);
129
130 // Specialized constructors are for streaming through release files, one component at a time
131
132 // Submit-block obtained from top of Seq-submit release file
133 CSeqEntryIndex (CSeq_entry& topsep, CSubmit_block &sblock, EPolicy policy = eAdaptive, TFlags flags = fDefault);
134 // Seq-descr chain obtained from top of Bioseq-set release file
135 CSeqEntryIndex (CSeq_entry& topsep, CSeq_descr &descr, EPolicy policy = eAdaptive, TFlags flags = fDefault);
136
137 private:
138 // Prohibit copy constructor & assignment operator
139 CSeqEntryIndex (const CSeqEntryIndex&) = delete;
140 CSeqEntryIndex& operator= (const CSeqEntryIndex&) = delete;
141
142 public:
143 // Bioseq exploration iterator
144 template<typename Fnc> size_t IterateBioseqs (Fnc m);
145
146 // GetBioseqIndex methods are provided for a variety of argument types
147
148 // Get first Bioseq index
149 CRef<CBioseqIndex> GetBioseqIndex (void);
150 // Get Nth Bioseq index
151 CRef<CBioseqIndex> GetBioseqIndex (int n);
152 // Get Bioseq index by accession
153 CRef<CBioseqIndex> GetBioseqIndex (const string& accn);
154 // Get Bioseq index by handle
155 CRef<CBioseqIndex> GetBioseqIndex (CBioseq_Handle bsh);
156 // Get Bioseq index by mapped feature
157 CRef<CBioseqIndex> GetBioseqIndex (const CMappedFeat& mf);
158 // Get Bioseq index by sublocation
159 CRef<CBioseqIndex> GetBioseqIndex (const CSeq_loc& loc);
160
161 // Seqset exploration iterator
162 template<typename Fnc> size_t IterateSeqsets (Fnc m);
163
164 const vector<CRef<CBioseqIndex>>& GetBioseqIndices(void);
165
166 const vector<CRef<CSeqsetIndex>>& GetSeqsetIndices(void);
167
168 bool DistributedReferences(void);
169
170 void SetSnpFunc(FAddSnpFunc* snp);
171
172 FAddSnpFunc* GetSnpFunc(void);
173
174 void SetFeatDepth(int featDepth);
175
176 int GetFeatDepth(void);
177
178 void SetGapDepth(int gapDepth);
179
180 int GetGapDepth(void);
181
182 // Check all Bioseqs for failure to fetch remote sequence components or feature annotation
183 bool IsFetchFailure(void);
184
185 // Check for failure to create scope
186 bool IsIndexFailure (void);
187
GetMasterIndex(void) const188 CRef<CSeqMasterIndex> GetMasterIndex(void) const { return m_Idx; }
189
190 private:
191 // Implementation details are in a separate CSeqMasterIndex object wrapped in a CRef
192 CRef<CSeqMasterIndex> m_Idx;
193 };
194
195
196 // CSeqMasterIndex
197 //
198 // CSeqMasterIndex holds the implementation methods and variables for the CSeqEntryIndex
199 class NCBI_XOBJUTIL_EXPORT CSeqMasterIndex : public CObjectEx
200 {
201 public:
202 // Constructor is separate from Initializers so that CSeqEntryIndex can capture a CRef to
203 // its CSeqMasterIndex, making CWeakRef<CSeqMasterIndex> available to GetFeatureForProduct
CSeqMasterIndex(void)204 CSeqMasterIndex (void) { }
205
206 public:
207 // Initializers take the top-level sequence object
208 void x_Initialize (CSeq_entry_Handle& topseh, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
209 void x_Initialize (CBioseq_Handle& bsh, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
210
211 void x_Initialize (CSeq_entry& topsep, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
212 void x_Initialize (CBioseq_set& seqset, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
213 void x_Initialize (CBioseq& bioseq, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
214 void x_Initialize (CSeq_submit& submit, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
215
216 void x_Initialize (CSeq_entry& topsep, CSubmit_block &sblock, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
217 void x_Initialize (CSeq_entry& topsep, CSeq_descr &descr, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags);
218
219 private:
220 // Prohibit copy constructor & assignment operator
221 CSeqMasterIndex (const CSeqMasterIndex&) = delete;
222 CSeqMasterIndex& operator= (const CSeqMasterIndex&) = delete;
223
224 public:
225 // Bioseq exploration iterator
226 template<typename Fnc> size_t IterateBioseqs (Fnc m);
227
228 // Get first Bioseq index
229 CRef<CBioseqIndex> GetBioseqIndex (void);
230 // Get Nth Bioseq index
231 CRef<CBioseqIndex> GetBioseqIndex (int n);
232 // Get Bioseq index by accession
233 CRef<CBioseqIndex> GetBioseqIndex (const string& accn);
234 // Get Bioseq index by handle
235 CRef<CBioseqIndex> GetBioseqIndex (CBioseq_Handle bsh);
236 // Get Bioseq index by string
237 CRef<CBioseqIndex> GetBioseqIndex (string& str);
238 // Get Bioseq index by feature
239 CRef<CBioseqIndex> GetBioseqIndex (const CMappedFeat& mf);
240 // Get Bioseq index by sublocation
241 CRef<CBioseqIndex> GetBioseqIndex (const CSeq_loc& loc);
242
243 // Seqset exploration iterator
244 template<typename Fnc> size_t IterateSeqsets (Fnc m);
245
246 // Getters
GetObjectManager(void) const247 CRef<CObjectManager> GetObjectManager (void) const { return m_Objmgr; }
GetScope(void) const248 CRef<CScope> GetScope (void) const { return m_Scope; }
GetTopSEH(void) const249 CSeq_entry_Handle GetTopSEH (void) const { return m_Tseh; }
GetTopSEP(void) const250 CConstRef<CSeq_entry> GetTopSEP (void) const { return m_Tsep; }
GetSbtBlk(void) const251 CConstRef<CSubmit_block> GetSbtBlk (void) const { return m_SbtBlk; }
GetTopDescr(void) const252 CConstRef<CSeq_descr> GetTopDescr (void) const { return m_TopDescr; }
GetFeatTree(void)253 CRef<feature::CFeatTree> GetFeatTree (void) { return m_FeatTree; }
254
255 const vector<CRef<CBioseqIndex>>& GetBioseqIndices(void);
256
257 const vector<CRef<CSeqsetIndex>>& GetSeqsetIndices(void);
258
SetHasOperon(bool hasOp)259 void SetHasOperon (bool hasOp) { m_HasOperon = hasOp; }
HasOperon(void) const260 bool HasOperon (void) const { return m_HasOperon; }
261
IsSmallGenomeSet(void) const262 bool IsSmallGenomeSet (void) const { return m_IsSmallGenomeSet; }
263
DistributedReferences(void) const264 bool DistributedReferences (void) const { return m_DistributedReferences; }
265
266 void SetSnpFunc(FAddSnpFunc* snp);
267
268 FAddSnpFunc* GetSnpFunc(void);
269
270 void SetFeatDepth(int featDepth);
271
272 int GetFeatDepth(void);
273
274 void SetGapDepth(int gapDepth);
275
276 int GetGapDepth(void);
277
278 // Check all Bioseqs for failure to fetch remote sequence components or remote feature annotation
279 bool IsFetchFailure(void);
280
281 // Check for failure to create scope
IsIndexFailure(void) const282 bool IsIndexFailure (void) const { return m_IndexFailure; }
SetIndexFailure(bool fails)283 void SetIndexFailure (bool fails) { m_IndexFailure = fails; }
284
285 private:
286 // Common initialization function called by each Initialize variant
287 void x_Init (void);
288
289 // Recursive exploration to populate vector of index objects for Bioseqs in Seq-entry
290 void x_InitSeqs (const CSeq_entry& sep, CRef<CSeqsetIndex> prnt, int level = 0);
291
292 private:
293 CRef<CObjectManager> m_Objmgr;
294 CRef<CScope> m_Scope;
295 CSeq_entry_Handle m_Tseh;
296
297 CConstRef<CSeq_entry> m_Tsep;
298 CConstRef<CSubmit_block> m_SbtBlk;
299 CConstRef<CSeq_descr> m_TopDescr;
300 CRef<feature::CFeatTree> m_FeatTree;
301
302 CSeqEntryIndex::EPolicy m_Policy;
303 CSeqEntryIndex::TFlags m_Flags;
304
305 vector<CRef<CBioseqIndex>> m_BsxList;
306
307 // map from accession string to CBioseqIndex object
308 typedef map<string, CRef<CBioseqIndex> > TAccnIndexMap;
309 TAccnIndexMap m_AccnIndexMap;
310
311 // map from CBioseq_Handle to CBioseqIndex object via best Seq-id string
312 typedef map<string, CRef<CBioseqIndex> > TBestIdIndexMap;
313 TBestIdIndexMap m_BestIdIndexMap;
314
315 vector<CRef<CSeqsetIndex>> m_SsxList;
316
317 bool m_HasOperon;
318 bool m_IsSmallGenomeSet;
319
320 bool m_DistributedReferences;
321
322 FAddSnpFunc* m_SnpFunc;
323
324 int m_FeatDepth;
325 int m_GapDepth;
326
327 mutable CAtomicCounter m_Counter;
328
329 bool m_IndexFailure;
330 };
331
332
333 // CSeqsetIndex
334 //
335 // CSeqsetIndex stores information about an element in the Bioseq-set hierarchy
336 class NCBI_XOBJUTIL_EXPORT CSeqsetIndex : public CObjectEx
337 {
338 public:
339 // Constructor
340 CSeqsetIndex (CBioseq_set_Handle ssh,
341 const CBioseq_set& bssp,
342 CRef<CSeqsetIndex> prnt);
343
344 private:
345 // Prohibit copy constructor & assignment operator
346 CSeqsetIndex (const CSeqsetIndex&) = delete;
347 CSeqsetIndex& operator= (const CSeqsetIndex&) = delete;
348
349 public:
350 // Getters
GetSeqsetHandle(void) const351 CBioseq_set_Handle GetSeqsetHandle (void) const { return m_Ssh; }
GetSeqset(void) const352 const CBioseq_set& GetSeqset (void) const { return m_Bssp; }
GetParent(void) const353 CRef<CSeqsetIndex> GetParent (void) const { return m_Prnt; }
354
GetClass(void) const355 CBioseq_set::TClass GetClass (void) const { return m_Class; }
356
357 private:
358 CBioseq_set_Handle m_Ssh;
359 const CBioseq_set& m_Bssp;
360 CRef<CSeqsetIndex> m_Prnt;
361
362 CBioseq_set::TClass m_Class;
363 };
364
365
366 // CBioseqIndex
367 //
368 // CBioseqIndex is the exploration organizer for a given Bioseq. It provides methods to
369 // obtain descriptors and iterate through features that apply to the Bioseq. (These are
370 // stored in vectors, which are initialized upon first request.)
371 //
372 // CBioseqIndex also maintains a CFeatTree for its Bioseq, used to find the best gene for
373 // each feature.
374 //
375 // Descriptors are explored with:
376 //
377 // bsx.IterateDescriptors([this](CDescriptorIndex& sdx) {
378 // ...
379 // });
380 //
381 // and are presented based on the order of the descriptor chain hierarchy, starting with
382 // descriptors packaged on the Bioseq, then on its parent Bioseq-set, etc.
383 //
384 // Features are explored with:
385 //
386 // bsx.IterateFeatures([this](CFeatureIndex& sfx) {
387 // ...
388 // });
389 //
390 // and are presented in order of biological position along the parent sequence.
391 //
392 // Fetching external features uses SAnnotSelector adaptive depth unless explicitly overridden.
393 class NCBI_XOBJUTIL_EXPORT CBioseqIndex : public CObjectEx
394 {
395 public:
396 // Constructor
397 CBioseqIndex (CBioseq_Handle bsh,
398 const CBioseq& bsp,
399 CBioseq_Handle obsh,
400 CRef<CSeqsetIndex> prnt,
401 CSeq_entry_Handle tseh,
402 CRef<CScope> scope,
403 CSeqMasterIndex& idx,
404 CSeqEntryIndex::EPolicy policy,
405 CSeqEntryIndex::TFlags flags);
406
407 // Destructor
408 ~CBioseqIndex (void);
409
410 private:
411 // Prohibit copy constructor & assignment operator
412 CBioseqIndex (const CBioseqIndex&) = delete;
413 CBioseqIndex& operator= (const CBioseqIndex&) = delete;
414
415 public:
416 // Gap exploration iterator
417 template<typename Fnc> size_t IterateGaps (Fnc m);
418
419 // Descriptor exploration iterator
420 template<typename Fnc> size_t IterateDescriptors (Fnc m);
421
422 // Feature exploration iterator
423 template<typename Fnc> size_t IterateFeatures (Fnc m);
424 template<typename Fnc> size_t IterateFeatures (CSeq_loc& slp, Fnc m);
425
426 // Getters
GetBioseqHandle(void) const427 CBioseq_Handle GetBioseqHandle (void) const { return m_Bsh; }
GetBioseq(void) const428 const CBioseq& GetBioseq (void) const { return m_Bsp; }
GetOrigBioseqHandle(void) const429 CBioseq_Handle GetOrigBioseqHandle (void) const { return m_OrigBsh; }
GetParent(void) const430 CRef<CSeqsetIndex> GetParent (void) const { return m_Prnt; }
GetScope(void) const431 CRef<CScope> GetScope (void) const { return m_Scope; }
GetSeqVector(void) const432 CRef<CSeqVector> GetSeqVector (void) const { return m_SeqVec; }
433
434 // Get master index
GetSeqMasterIndex(void) const435 CWeakRef<CSeqMasterIndex> GetSeqMasterIndex (void) const { return m_Idx; }
436
437 // Get sequence letters from Bioseq
438 string GetSequence (void);
439 void GetSequence (string& buffer);
440 // Get sequence letters from Bioseq subrange
441 string GetSequence (int from, int to);
442 void GetSequence (int from, int to, string& buffer);
443
444 // Map from GetBestGene result to CFeatureIndex object
445 CRef<CFeatureIndex> GetFeatIndex (const CMappedFeat& mf);
446
447 const vector<CRef<CGapIndex>>& GetGapIndices(void);
448
449 const vector<CRef<CDescriptorIndex>>& GetDescriptorIndices(void);
450
451 const vector<CRef<CFeatureIndex>>& GetFeatureIndices(void);
452
453 // Get feature (CDS, mRNA, Prot) with product pointing to this Bioseq (protein, cDNA, peptide)
454 CRef<CFeatureIndex> GetFeatureForProduct(void);
455
456 // Get Bioseq index containing feature with product pointing to this Bioseq
457 CWeakRef<CBioseqIndex> GetBioseqForProduct (void);
458
459 // Get best (longest) protein feature on this protein Bioseq
460 CRef<CFeatureIndex> GetBestProteinFeature(void);
461
462 // Flag to indicate failure to fetch remote sequence components or feature annotation
IsFetchFailure(void) const463 bool IsFetchFailure (void) const { return m_FetchFailure; }
464
SetFetchFailure(bool fails)465 void SetFetchFailure (bool fails) { m_FetchFailure = fails; }
466
467 public:
468 // Seq-inst fields
IsNA(void) const469 bool IsNA (void) const { return m_IsNA; }
IsAA(void) const470 bool IsAA (void) const { return m_IsAA; }
GetTopology(void) const471 CSeq_inst::TTopology GetTopology (void) const { return m_Topology; }
GetLength(void) const472 CSeq_inst::TLength GetLength (void) const { return m_Length; }
473
IsDelta(void) const474 bool IsDelta (void) const { return m_IsDelta; }
IsDeltaLitOnly(void) const475 bool IsDeltaLitOnly (void) const { return m_IsDeltaLitOnly; }
IsVirtual(void) const476 bool IsVirtual (void) const { return m_IsVirtual; }
IsMap(void) const477 bool IsMap (void) const { return m_IsMap; }
478
479 // Seq-id fields
GetAccession(void) const480 const string& GetAccession (void) const { return m_Accession; }
481
IsRefSeq(void) const482 bool IsRefSeq (void) const { return m_IsRefSeq; }
IsNC(void) const483 bool IsNC (void) const { return m_IsNC; }
IsNM(void) const484 bool IsNM (void) const { return m_IsNM; }
IsNR(void) const485 bool IsNR (void) const { return m_IsNR; }
IsNZ(void) const486 bool IsNZ (void) const { return m_IsNZ; }
IsPatent(void) const487 bool IsPatent (void) const { return m_IsPatent; }
IsPDB(void) const488 bool IsPDB (void) const { return m_IsPDB; }
IsWP(void) const489 bool IsWP (void) const { return m_IsWP; }
IsThirdParty(void) const490 bool IsThirdParty (void) const { return m_ThirdParty; }
IsWGSMaster(void) const491 bool IsWGSMaster (void) const { return m_WGSMaster; }
IsTSAMaster(void) const492 bool IsTSAMaster (void) const { return m_TSAMaster; }
IsTLSMaster(void) const493 bool IsTLSMaster (void) const { return m_TLSMaster; }
494
GetGeneralStr(void) const495 string GetGeneralStr (void) const { return m_GeneralStr; }
GetGeneralId(void) const496 int GetGeneralId (void) const { return m_GeneralId; }
497
GetPatentCountry(void) const498 string GetPatentCountry (void) const { return m_PatentCountry; }
GetPatentNumber(void) const499 string GetPatentNumber (void) const { return m_PatentNumber; }
GetPatentSequence(void) const500 int GetPatentSequence (void) const { return m_PatentSequence; }
501
GetPDBChain(void) const502 int GetPDBChain (void) const { return m_PDBChain; }
GetPDBChainID(void) const503 string GetPDBChainID (void) const { return m_PDBChainID; }
504
505 // Most important descriptor fields
506
507 const string& GetTitle (void);
508
509 CConstRef<CMolInfo> GetMolInfo (void);
510 CMolInfo::TBiomol GetBiomol (void);
511 CMolInfo::TTech GetTech (void);
512 CMolInfo::TCompleteness GetCompleteness (void);
513
514 CConstRef<CBioSource> GetBioSource (void);
515 const string& GetTaxname (void);
516
517 const string& GetDescTaxname (void);
518
519 bool IsHTGTech (void);
520 bool IsHTGSUnfinished (void);
521 bool IsTLS (void);
522 bool IsTSA (void);
523 bool IsWGS (void);
524 bool IsEST_STS_GSS (void);
525
526 bool IsUseBiosrc (void);
527
528 const string& GetCommon (void);
529 const string& GetLineage (void);
530 TTaxId GetTaxid (void);
531 bool IsUsingAnamorph (void);
532
533 CTempString GetGenus (void);
534 CTempString GetSpecies (void);
535 bool IsMultispecies (void);
536 CBioSource::TGenome GetGenome (void);
537 bool IsPlasmid (void);
538 bool IsChromosome (void);
539
540 const string& GetOrganelle (void);
541
542 string GetFirstSuperKingdom (void);
543 string GetSecondSuperKingdom (void);
544 bool IsCrossKingdom (void);
545
546 CTempString GetChromosome (void);
547 CTempString GetLinkageGroup (void);
548 CTempString GetClone (void);
549 bool HasClone (void);
550 CTempString GetMap (void);
551 CTempString GetPlasmid (void);
552 CTempString GetSegment (void);
553
554 CTempString GetBreed (void);
555 CTempString GetCultivar (void);
556 CTempString GetIsolate (void);
557 CTempString GetStrain (void);
558 CTempString GetSubstrain (void);
559 CTempString GetMetaGenomeSource (void);
560
561 bool IsHTGSCancelled (void);
562 bool IsHTGSDraft (void);
563 bool IsHTGSPooled (void);
564 bool IsTPAExp (void);
565 bool IsTPAInf (void);
566 bool IsTPAReasm (void);
567 bool IsUnordered (void);
568
569 CTempString GetPDBCompound (void);
570
571 bool IsForceOnlyNearFeats (void);
572
573 bool IsUnverified (void);
574 bool IsUnverifiedFeature (void);
575 bool IsUnverifiedOrganism (void);
576 bool IsUnverifiedMisassembled (void);
577 bool IsUnverifiedContaminant (void);
578
579 CTempString GetTargetedLocus (void);
580
581 const string& GetComment (void);
582 bool IsPseudogene (void);
583
584 bool HasOperon (void);
585 bool HasGene (void);
586 bool HasMultiIntervalGenes (void);
587 bool HasSource (void);
588
589 string GetrEnzyme (void);
590
591 private:
592 // Common gap collection, delayed until actually needed
593 void x_InitGaps (void);
594
595 // Common descriptor collection, delayed until actually needed
596 void x_InitDescs (void);
597
598 // Common feature collection, delayed until actually needed
599 void x_InitFeats (void);
600 void x_InitFeats (CSeq_loc& slp);
601
602 void x_DefaultSelector(SAnnotSelector& sel, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, bool onlyNear, CScope& scope);
603
604 // common implementation method
605 void x_InitFeats (CSeq_loc* slpp);
606
607 // Set BioSource flags
608 void x_InitSource (void);
609
610 private:
611 CBioseq_Handle m_Bsh;
612 const CBioseq& m_Bsp;
613 CBioseq_Handle m_OrigBsh;
614 CRef<CSeqsetIndex> m_Prnt;
615 CSeq_entry_Handle m_Tseh;
616 CRef<CScope> m_Scope;
617
618 CWeakRef<CSeqMasterIndex> m_Idx;
619
620 bool m_GapsInitialized;
621 vector<CRef<CGapIndex>> m_GapList;
622
623 bool m_DescsInitialized;
624 vector<CRef<CDescriptorIndex>> m_SdxList;
625
626 bool m_FeatsInitialized;
627 vector<CRef<CFeatureIndex>> m_SfxList;
628
629 bool m_SourcesInitialized;
630
631 bool m_FeatForProdInitialized;
632 CRef<CFeatureIndex> m_FeatureForProduct;
633
634 bool m_BestProtFeatInitialized;
635 CRef<CFeatureIndex> m_BestProteinFeature;
636
637 // CFeatureIndex from CMappedFeat for use with GetBestGene
638 typedef map<CMappedFeat, CRef<CFeatureIndex> > TFeatIndexMap;
639 TFeatIndexMap m_FeatIndexMap;
640
641 CRef<CSeqVector> m_SeqVec;
642
643 CSeqEntryIndex::EPolicy m_Policy;
644 CSeqEntryIndex::TFlags m_Flags;
645
646 bool m_FetchFailure;
647
648 private:
649 // Seq-inst fields
650 bool m_IsNA;
651 bool m_IsAA;
652 CSeq_inst::TTopology m_Topology;
653 CSeq_inst::TLength m_Length;
654
655 bool m_IsDelta;
656 bool m_IsDeltaLitOnly;
657 bool m_IsVirtual;
658 bool m_IsMap;
659
660 // Seq-id fields
661 string m_Accession;
662
663 bool m_IsRefSeq;
664 bool m_IsNC;
665 bool m_IsNM;
666 bool m_IsNR;
667 bool m_IsNZ;
668 bool m_IsPatent;
669 bool m_IsPDB;
670 bool m_IsWP;
671 bool m_ThirdParty;
672 bool m_WGSMaster;
673 bool m_TSAMaster;
674 bool m_TLSMaster;
675
676 string m_GeneralStr;
677 int m_GeneralId;
678
679 string m_PatentCountry;
680 string m_PatentNumber;
681 int m_PatentSequence;
682
683 int m_PDBChain;
684 string m_PDBChainID;
685
686 // Instantiated title
687 string m_Title;
688
689 // MolInfo fields
690 CConstRef<CMolInfo> m_MolInfo;
691 CMolInfo::TBiomol m_Biomol;
692 CMolInfo::TTech m_Tech;
693 CMolInfo::TCompleteness m_Completeness;
694
695 bool m_HTGTech;
696 bool m_HTGSUnfinished;
697 bool m_IsTLS;
698 bool m_IsTSA;
699 bool m_IsWGS;
700 bool m_IsEST_STS_GSS;
701
702 bool m_UseBiosrc;
703
704 // BioSource fields
705 CConstRef<CBioSource> m_DescBioSource;
706 string m_DescTaxname;
707
708 CConstRef<CBioSource> m_BioSource;
709 string m_Taxname;
710
711 string m_Common;
712 string m_Lineage;
713 TTaxId m_Taxid;
714 bool m_UsingAnamorph;
715
716 CTempString m_Genus;
717 CTempString m_Species;
718 bool m_Multispecies;
719 CBioSource::TGenome m_Genome;
720 bool m_IsPlasmid;
721 bool m_IsChromosome;
722
723 string m_Organelle;
724
725 string m_FirstSuperKingdom;
726 string m_SecondSuperKingdom;
727 bool m_IsCrossKingdom;
728
729 // Subsource fields
730 CTempString m_Chromosome;
731 CTempString m_LinkageGroup;
732 CTempString m_Clone;
733 bool m_has_clone;
734 CTempString m_Map;
735 CTempString m_Plasmid;
736 CTempString m_Segment;
737
738 // Orgmod fields
739 CTempString m_Breed;
740 CTempString m_Cultivar;
741 CTempString m_Isolate;
742 CTempString m_Strain;
743 CTempString m_Substrain;
744 CTempString m_MetaGenomeSource;
745
746 // Keyword fields (genbank or embl blocks)
747 bool m_HTGSCancelled;
748 bool m_HTGSDraft;
749 bool m_HTGSPooled;
750 bool m_TPAExp;
751 bool m_TPAInf;
752 bool m_TPAReasm;
753 bool m_Unordered;
754
755 // PDB block fields
756 CTempString m_PDBCompound;
757
758 // User object fields
759 bool m_ForceOnlyNearFeats;
760
761 bool m_IsUnverified;
762 bool m_IsUnverifiedFeature;
763 bool m_IsUnverifiedOrganism;
764 bool m_IsUnverifiedMisassembled;
765 bool m_IsUnverifiedContaminant;
766 CTempString m_UnverifiedPrefix;
767
768 CTempString m_TargetedLocus;
769
770 // Comment fields
771 string m_Comment;
772 bool m_IsPseudogene;
773
774 // Feature fields
775 bool m_HasGene;
776 bool m_HasMultiIntervalGenes;
777 bool m_HasSource;
778
779 // Map fields
780 string m_rEnzyme;
781 };
782
783
784 // CGapIndex
785 //
// CGapIndex stores information about an indexed gap
787 class NCBI_XOBJUTIL_EXPORT CGapIndex : public CObject
788 {
789 public:
790 // Constructor
791 CGapIndex (TSeqPos start,
792 TSeqPos end,
793 TSeqPos length,
794 const string& type,
795 const vector<string>& evidence,
796 bool isUnknownLength,
797 bool isAssemblyGap,
798 CBioseqIndex& bsx);
799
800 private:
801 // Prohibit copy constructor & assignment operator
802 CGapIndex (const CGapIndex&) = delete;
803 CGapIndex& operator= (const CGapIndex&) = delete;
804
805 public:
806 // Getters
807
GetStart(void) const808 TSeqPos GetStart (void) const { return m_Start; }
GetEnd(void) const809 TSeqPos GetEnd (void) const { return m_End; }
GetLength(void) const810 TSeqPos GetLength (void) const { return m_Length; }
GetGapType(void) const811 const string GetGapType (void) const { return m_GapType; }
GetGapEvidence(void) const812 const vector<string>& GetGapEvidence (void) const { return m_GapEvidence; }
IsUnknownLength(void) const813 bool IsUnknownLength (void) const { return m_IsUnknownLength; }
IsAssemblyGap(void) const814 bool IsAssemblyGap (void) const { return m_IsAssemblyGap; }
815
816 // Get parent Bioseq index
GetBioseqIndex(void) const817 CWeakRef<CBioseqIndex> GetBioseqIndex (void) const { return m_Bsx; }
818
819 private:
820 CWeakRef<CBioseqIndex> m_Bsx;
821
822 TSeqPos m_Start;
823 TSeqPos m_End;
824 TSeqPos m_Length;
825
826 string m_GapType;
827 vector<string> m_GapEvidence;
828
829 bool m_IsUnknownLength;
830 bool m_IsAssemblyGap;
831 };
832
833
834 // CDescriptorIndex
835 //
836 // CDescriptorIndex stores information about an indexed descriptor
837 class NCBI_XOBJUTIL_EXPORT CDescriptorIndex : public CObject
838 {
839 public:
840 // Constructor
841 CDescriptorIndex (const CSeqdesc& sd,
842 CBioseqIndex& bsx);
843
844 private:
845 // Prohibit copy constructor & assignment operator
846 CDescriptorIndex (const CDescriptorIndex&) = delete;
847 CDescriptorIndex& operator= (const CDescriptorIndex&) = delete;
848
849 public:
850 // Getters
GetSeqDesc(void) const851 const CSeqdesc& GetSeqDesc (void) const { return m_Sd; }
852
853 // Get parent Bioseq index
GetBioseqIndex(void) const854 CWeakRef<CBioseqIndex> GetBioseqIndex (void) const { return m_Bsx; }
855
856 // Get descriptor type (e.g., CSeqdesc::e_Molinfo)
GetType(void) const857 CSeqdesc::E_Choice GetType (void) const { return m_Type; }
858
859 private:
860 const CSeqdesc& m_Sd;
861 CWeakRef<CBioseqIndex> m_Bsx;
862
863 CSeqdesc::E_Choice m_Type;
864 };
865
866
867 // CFeatureIndex
868 //
869 // CFeatureIndex stores information about an indexed feature
870 class NCBI_XOBJUTIL_EXPORT CFeatureIndex : public CObject
871 {
872 public:
873 // Constructor
874 CFeatureIndex (CSeq_feat_Handle sfh,
875 const CMappedFeat mf,
876 CConstRef<CSeq_loc> feat_loc,
877 CBioseqIndex& bsx);
878
879 private:
880 // Prohibit copy constructor & assignment operator
881 CFeatureIndex (const CFeatureIndex&) = delete;
882 CFeatureIndex& operator= (const CFeatureIndex&) = delete;
883
884 public:
885 // Getters
GetSeqFeatHandle(void) const886 CSeq_feat_Handle GetSeqFeatHandle (void) const { return m_Sfh; }
GetMappedFeat(void) const887 const CMappedFeat GetMappedFeat (void) const { return m_Mf; }
GetSeqVector(void) const888 CRef<CSeqVector> GetSeqVector (void) const { return m_SeqVec; }
889
GetMappedLocation(void) const890 CConstRef<CSeq_loc> GetMappedLocation(void) const { return m_Fl; }
891
892 // Get parent Bioseq index
GetBioseqIndex(void) const893 CWeakRef<CBioseqIndex> GetBioseqIndex (void) const { return m_Bsx; }
894
895 // Get feature type (e.g. CSeqFeatData::e_Rna)
GetType(void) const896 CSeqFeatData::E_Choice GetType (void) const { return m_Type; }
897
898 // Get feature subtype (e.g. CSeqFeatData::eSubtype_mRNA)
GetSubtype(void) const899 CSeqFeatData::ESubtype GetSubtype (void) const { return m_Subtype; }
900
GetStart(void) const901 TSeqPos GetStart (void) const { return m_Start; }
GetEnd(void) const902 TSeqPos GetEnd (void) const { return m_End; }
903
904 // Get sequence letters under feature intervals
905 string GetSequence (void);
906 void GetSequence (string& buffer);
907 // Get sequence letters under feature subrange
908 string GetSequence (int from, int to);
909 void GetSequence (int from, int to, string& buffer);
910
911 // Map from feature to CFeatureIndex for best gene using CFeatTree in parent CBioseqIndex
912 CRef<CFeatureIndex> GetBestGene (void);
913
914 // Map from feature to CFeatureIndex for best VDJC parent using CFeatTree in parent CBioseqIndex
915 CRef<CFeatureIndex> GetBestParent (void);
916
917 // Find CFeatureIndex object for overlapping source feature using internal CFeatTree
918 CRef<CFeatureIndex> GetOverlappingSource (void);
919
920 private:
921 void SetFetchFailure (bool fails);
922
923 private:
924 CSeq_feat_Handle m_Sfh;
925 const CMappedFeat m_Mf;
926 CConstRef<CSeq_loc> m_Fl;
927 CRef<CSeqVector> m_SeqVec;
928 CWeakRef<CBioseqIndex> m_Bsx;
929
930 CSeqFeatData::E_Choice m_Type;
931 CSeqFeatData::ESubtype m_Subtype;
932
933 TSeqPos m_Start;
934 TSeqPos m_End;
935 };
936
937
938 // CWordPairIndexer
939 //
940 // CWordPairIndexer generates normalized terms and adjacent word pairs for Entrez indexing
941 class NCBI_XOBJUTIL_EXPORT CWordPairIndexer
942 {
943 public:
944 // Constructor
CWordPairIndexer(void)945 CWordPairIndexer (void) { }
946
947 private:
948 // Prohibit copy constructor & assignment operator
949 CWordPairIndexer (const CWordPairIndexer&) = delete;
950 CWordPairIndexer& operator= (const CWordPairIndexer&) = delete;
951
952 public:
953 void PopulateWordPairIndex (string str);
954
955 template<typename Fnc> void IterateNorm (Fnc m);
956 template<typename Fnc> void IteratePair (Fnc m);
957
958 public:
959 static string ConvertUTF8ToAscii(const string& str);
960 static string TrimPunctuation (const string& str);
961 static string TrimMixedContent (const string& str);
962 static bool IsStopWord(const string& str);
963
GetNorm(void) const964 const vector<string>& GetNorm (void) const { return m_Norm; }
GetPair(void) const965 const vector<string>& GetPair (void) const { return m_Pair; }
966
967 private:
968 string x_AddToWordPairIndex (string item, string prev);
969
970 vector<string> m_Norm;
971 vector<string> m_Pair;
972 };
973
974
975 // Inline lambda function implementations
976
977 // Visit CBioseqIndex objects for all Bioseqs
978 template<typename Fnc>
979 inline
IterateBioseqs(Fnc m)980 size_t CSeqEntryIndex::IterateBioseqs (Fnc m)
981
982 {
983 return m_Idx->IterateBioseqs(m);
984 }
985
986 template<typename Fnc>
987 inline
IterateBioseqs(Fnc m)988 size_t CSeqMasterIndex::IterateBioseqs (Fnc m)
989
990 {
991 int count = 0;
992 for (auto& bsx : m_BsxList) {
993 m(*bsx);
994 count++;
995 }
996 return count;
997 }
998
999 // Visit CSeqsetIndex objects for all Seqsets
1000 template<typename Fnc>
1001 inline
IterateSeqsets(Fnc m)1002 size_t CSeqEntryIndex::IterateSeqsets (Fnc m)
1003
1004 {
1005 return m_Idx->IterateSeqsets(m);
1006 }
1007
1008 template<typename Fnc>
1009 inline
IterateSeqsets(Fnc m)1010 size_t CSeqMasterIndex::IterateSeqsets (Fnc m)
1011
1012 {
1013 int count = 0;
1014 for (auto& ssx : m_SsxList) {
1015 m(*ssx);
1016 count++;
1017 }
1018 return count;
1019 }
1020
1021 // Visit CGapIndex objects for all gaps
1022 template<typename Fnc>
1023 inline
IterateGaps(Fnc m)1024 size_t CBioseqIndex::IterateGaps (Fnc m)
1025
1026 {
1027 int count = 0;
1028 try {
1029 // Delay gap collection until first request
1030 if (! m_GapsInitialized) {
1031 x_InitGaps();
1032 }
1033
1034 for (auto& sgx : m_GapList) {
1035 count++;
1036 m(*sgx);
1037 }
1038 }
1039 catch (CException& e) {
1040 ERR_POST(Error << "Error in CBioseqIndex::IterateGaps: " << e.what());
1041 }
1042 return count;
1043 }
1044
1045 // Visit CDescriptorIndex objects for all descriptors
1046 template<typename Fnc>
1047 inline
IterateDescriptors(Fnc m)1048 size_t CBioseqIndex::IterateDescriptors (Fnc m)
1049
1050 {
1051 int count = 0;
1052 try {
1053 // Delay descriptor collection until first request
1054 if (! m_DescsInitialized) {
1055 x_InitDescs();
1056 }
1057
1058 for (auto& sdx : m_SdxList) {
1059 count++;
1060 m(*sdx);
1061 }
1062 }
1063 catch (CException& e) {
1064 ERR_POST(Error << "Error in CBioseqIndex::IterateDescriptors: " << e.what());
1065 }
1066 return count;
1067 }
1068
1069 // Visit CFeatureIndex objects for all features
1070 template<typename Fnc>
1071 inline
IterateFeatures(Fnc m)1072 size_t CBioseqIndex::IterateFeatures (Fnc m)
1073
1074 {
1075 int count = 0;
1076 try {
1077 // Delay feature collection until first request
1078 if (! m_FeatsInitialized) {
1079 x_InitFeats();
1080 }
1081
1082 for (auto& sfx : m_SfxList) {
1083 count++;
1084 m(*sfx);
1085 }
1086 }
1087 catch (CException& e) {
1088 ERR_POST(Error << "Error in CBioseqIndex::IterateFeatures: " << e.what());
1089 }
1090 return count;
1091 }
1092
1093 template<typename Fnc>
1094 inline
IterateFeatures(CSeq_loc & slp,Fnc m)1095 size_t CBioseqIndex::IterateFeatures (CSeq_loc& slp, Fnc m)
1096
1097 {
1098 int count = 0;
1099 try {
1100 // Delay feature collection until first request, but do not bail on m_FeatsInitialized flag
1101 x_InitFeats(slp);
1102
1103 for (auto& sfx : m_SfxList) {
1104 count++;
1105 m(*sfx);
1106 }
1107 }
1108 catch (CException& e) {
1109 ERR_POST(Error << "Error in CBioseqIndex::IterateFeatures: " << e.what());
1110 }
1111 return count;
1112 }
1113
1114 template<typename Fnc>
1115 inline
IterateNorm(Fnc m)1116 void CWordPairIndexer::IterateNorm (Fnc m)
1117
1118 {
1119 for (auto& str : m_Norm) {
1120 m(str);
1121 }
1122 }
1123
1124 template<typename Fnc>
1125 inline
IteratePair(Fnc m)1126 void CWordPairIndexer::IteratePair (Fnc m)
1127
1128 {
1129 for (auto& str : m_Pair) {
1130 m(str);
1131 }
1132 }
1133
1134
1135 END_SCOPE(objects)
1136 END_NCBI_SCOPE
1137
1138 #endif /* FEATURE_INDEXER__HPP */
1139