1 #ifndef SRA__READER__SRA__CSRAREAD__HPP
2 #define SRA__READER__SRA__CSRAREAD__HPP
3 /*  $Id: csraread.hpp 593794 2019-09-24 17:56:58Z vasilche $
4  * ===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors:  Eugene Vasilchenko
29  *
30  * File Description:
31  *   Access to cSRA files
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbiobj.hpp>
37 #include <corelib/ncbimtx.hpp>
38 #include <util/range.hpp>
39 #include <sra/readers/sra/vdbread.hpp>
40 #include <objects/seq/seq_id_handle.hpp>
41 #include <objects/seq/Annotdesc.hpp>
42 #include <objects/seq/Seq_literal.hpp>
43 #include <map>
44 #include <list>
45 
// The three INSDC scalar type names below are declared locally; the include
// that would otherwise supply them is left commented out:
//#include <insdc/sra.h> // for INSDC_coord_one, INSDC_coord_len, INSDC_read_filter
typedef uint8_t  INSDC_read_filter;
typedef uint32_t INSDC_coord_len;
typedef int32_t  INSDC_coord_one;
50 
51 BEGIN_NCBI_NAMESPACE;
52 BEGIN_NAMESPACE(objects);
53 
// Forward declarations: iterator classes defined further down in this header.
class CCSraAlignIterator;
class CCSraRefSeqIterator;
class CCSraShortReadIterator;

// Forward declarations: other types named by the declarations below.
class CBioseq;
class CSeq_align;
class CSeq_annot;
class CSeq_entry;
class CSeq_graph;
class CUser_field;
class CUser_object;
class IIdMapper;
65 
// Enumerations shared by the cSRA database classes and iterators declared
// in this header (each of them derives from this struct).
struct SCSraDb_Defs
{
    // Kind of id to use for reference sequences.
    // NOTE(review): exact semantics of each value are defined by the
    // implementation - confirm there.
    enum ERefIdType {
        eRefId_SEQ_ID   = 0,
        eRefId_gnl_NAME = 1
    };

    // Whether the file path takes part in generated ids; used as an
    // argument of CCSraDb::MakeSraIdPart().
    enum EPathInIdType {
        ePathInId_config = 0,
        ePathInId_yes    = 1,
        ePathInId_no     = 2
    };

    // Bit flags selecting alignment kinds.
    enum EAlignType {
        fPrimaryAlign   = 0x1,
        fSecondaryAlign = 0x2,
        fAnyAlign       = fPrimaryAlign | fSecondaryAlign
    };
    typedef EAlignType TAlignType;
};
84 
85 
86 class NCBI_SRAREAD_EXPORT CCSraDb_Impl : public CObject, public SCSraDb_Defs
87 {
88 public:
89     CCSraDb_Impl(CVDBMgr& mgr, const string& csra_path,
90                  IIdMapper* ref_id_mapper,
91                  ERefIdType ref_id_type,
92                  const string& sra_id_part);
93     virtual ~CCSraDb_Impl(void);
94 
95     // SRefInfo holds cached refseq information - ids, len, rows
96     struct SRefInfo {
97         string m_Name, m_SeqId;
98         mutable volatile TSeqPos m_SeqLength; // actual len will be updated
99         CBioseq::TId m_Seq_ids;
100         CSeq_id_Handle m_Seq_id_Handle;
GetMainSeq_idCCSraDb_Impl::SRefInfo101         CRef<CSeq_id>& GetMainSeq_id(void) {
102             return m_Seq_ids.front();
103         }
GetMainSeq_idCCSraDb_Impl::SRefInfo104         const CRef<CSeq_id>& GetMainSeq_id(void) const {
105             return m_Seq_ids.front();
106         }
GetMainSeq_id_HandleCCSraDb_Impl::SRefInfo107         const CSeq_id_Handle& GetMainSeq_id_Handle(void) const {
108             return m_Seq_id_Handle;
109         }
110         TVDBRowId m_RowFirst, m_RowLast;
111         bool m_Circular;
112         vector<TSeqPos> m_AlnOverStarts; // relative to m_RowFirst
113     };
114     typedef list<SRefInfo> TRefInfoList;
115     typedef map<string, TRefInfoList::iterator, PNocase> TRefInfoMapByName;
116     typedef map<CSeq_id_Handle, TRefInfoList::iterator> TRefInfoMapBySeq_id;
117 
GetRefInfoList(void) const118     const TRefInfoList& GetRefInfoList(void) const {
119         return m_RefList;
120     }
GetRefInfoMapByName(void) const121     const TRefInfoMapByName& GetRefInfoMapByName(void) const {
122         return m_RefMapByName;
123     }
GetRefInfoMapBySeq_id(void) const124     const TRefInfoMapBySeq_id& GetRefInfoMapBySeq_id(void) const {
125         return m_RefMapBySeq_id;
126     }
127 
GetRowSize(void) const128     TSeqPos GetRowSize(void) const {
129         return m_RowSize;
130     }
131 
132     typedef vector<string> TSpotGroups;
133     void GetSpotGroups(TSpotGroups& spot_groups);
134 
GetCSraPath(void) const135     const string& GetCSraPath(void) const {
136         return m_CSraPath;
137     }
138 
GetSraIdPart(void) const139     const string& GetSraIdPart(void) const {
140         return m_SraIdPart;
141     }
SetSraIdPart(const string & s)142     void SetSraIdPart(const string& s) {
143         m_SraIdPart = s;
144     }
145 
146     CRef<CSeq_id> MakeShortReadId(TVDBRowId id1, INSDC_coord_one id2) const;
147     void SetShortReadId(string& str, TVDBRowId id1, INSDC_coord_one id2) const;
148 
149 protected:
150     friend class CCSraRefSeqIterator;
151     friend class CCSraAlignIterator;
152     friend class CCSraShortReadIterator;
153 
154     void x_CalcSeqLength(const SRefInfo& info);
155 
156     // SRefTableCursor is helper accessor structure for refseq table
157     struct SRefTableCursor;
158     // SAlnTableCursor is helper accessor structure for align table
159     struct SAlnTableCursor;
160     // SSeqTableCursor is helper accessor structure for sequence table
161     struct SSeqTableCursor;
162     friend struct SSeqTableCursor;
163 
164     // get table accessor object for exclusive access
165     CRef<SRefTableCursor> Ref(void);
166     CRef<SAlnTableCursor> Aln(bool is_secondary);
167     CRef<SSeqTableCursor> Seq(void);
168     // return table accessor object for reuse
169     void Put(CRef<SRefTableCursor>& curs);
170     void Put(CRef<SAlnTableCursor>& curs);
171     void Put(CRef<SSeqTableCursor>& curs);
172 
173 protected:
174     void OpenRefTable(void);
175     void OpenAlnTable(bool is_secondary);
176     void OpenSeqTable(void);
177 
RefTable(void)178     const CVDBTable& RefTable(void) {
179         const CVDBTable& table = m_RefTable;
180         if ( !table ) {
181             OpenRefTable();
182         }
183         return table;
184     }
AlnTable(bool is_secondary)185     const CVDBTable& AlnTable(bool is_secondary) {
186         const CVDBTable& table = m_AlnTable[is_secondary];
187         if ( !table ) {
188             OpenAlnTable(is_secondary);
189         }
190         return table;
191     }
SeqTable(void)192     const CVDBTable& SeqTable(void) {
193         const CVDBTable& table = m_SeqTable;
194         if ( !table ) {
195             OpenSeqTable();
196         }
197         return table;
198     }
199 
200     void x_MakeRefSeq_ids(SRefInfo& info,
201                           IIdMapper* ref_id_mapper,
202                           int ref_id_type);
203 
204 private:
205     CVDBMgr m_Mgr;
206     CVDB m_Db;
207     string m_CSraPath;
208     string m_SraIdPart;
209 
210     CFastMutex m_TableMutex;
211     CFastMutex m_OverlapMutex;
212     CVDBTable m_RefTable;
213     CVDBTable m_AlnTable[2];
214     CVDBTable m_SeqTable;
215 
216     CVDBObjectCache<SRefTableCursor> m_Ref;
217     CVDBObjectCache<SAlnTableCursor> m_Aln[2];
218     CVDBObjectCache<SSeqTableCursor> m_Seq;
219 
220     TSeqPos m_RowSize; // cached size of refseq row in bases
221     TRefInfoList m_RefList; // list of cached refseqs' information
222     TRefInfoMapByName m_RefMapByName; // index for refseq info lookup
223     TRefInfoMapBySeq_id m_RefMapBySeq_id; // index for refseq info lookup
224 };
225 
226 
227 class CCSraDb : public CRef<CCSraDb_Impl>, public SCSraDb_Defs
228 {
229 public:
CCSraDb(void)230     CCSraDb(void)
231         {
232         }
233     NCBI_SRAREAD_EXPORT
234     CCSraDb(CVDBMgr& mgr,
235             const string& csra_path,
236             IIdMapper* ref_id_mapper = NULL,
237             ERefIdType ref_id_type = eRefId_SEQ_ID);
238     NCBI_SRAREAD_EXPORT
239     CCSraDb(CVDBMgr& mgr,
240             const string& csra_path,
241             const string& sra_id_part,
242             IIdMapper* ref_id_mapper = NULL,
243             ERefIdType ref_id_type = eRefId_SEQ_ID);
244 
245     NCBI_SRAREAD_EXPORT
246     static string MakeSraIdPart(EPathInIdType path_in_id_type,
247                                 const string& dir_path,
248                                 const string& csra_file);
249 
GetRowSize(void) const250     TSeqPos GetRowSize(void) const
251         {
252             return GetObject().GetRowSize();
253         }
254 
255     typedef CCSraDb_Impl::TSpotGroups TSpotGroups;
GetSpotGroups(TSpotGroups & spot_groups)256     void GetSpotGroups(TSpotGroups& spot_groups)
257         {
258             GetObject().GetSpotGroups(spot_groups);
259         }
260 };
261 
262 
263 class NCBI_SRAREAD_EXPORT CCSraRefSeqIterator : public SCSraDb_Defs
264 {
265 public:
CCSraRefSeqIterator(void)266     CCSraRefSeqIterator(void)
267         {
268         }
CCSraRefSeqIterator(const CCSraDb & csra_db,CCSraDb_Impl::TRefInfoList::const_iterator iter)269     CCSraRefSeqIterator(const CCSraDb& csra_db,
270                         CCSraDb_Impl::TRefInfoList::const_iterator iter)
271         : m_Db(csra_db),
272           m_Iter(iter)
273         {
274         }
275     explicit CCSraRefSeqIterator(const CCSraDb& csra_db);
276     NCBI_DEPRECATED
277     CCSraRefSeqIterator(const CCSraDb& csra_db, const string& seq_id);
278     enum EByName {
279         eByName
280     };
281     CCSraRefSeqIterator(const CCSraDb& csra_db, const string& name,
282                         EByName /*by_name*/);
283     CCSraRefSeqIterator(const CCSraDb& csra_db, const CSeq_id_Handle& seq_id);
284 
operator !(void) const285     bool operator!(void) const {
286         return !m_Db || m_Iter == m_Db->GetRefInfoList().end();
287     }
operator const void*(void) const288     operator const void*(void) const {
289         return !*this? 0: this;
290     }
291 
292     const CCSraDb_Impl::SRefInfo& GetInfo(void) const;
operator *(void) const293     const CCSraDb_Impl::SRefInfo& operator*(void) const {
294         return GetInfo();
295     }
operator ->(void) const296     const CCSraDb_Impl::SRefInfo* operator->(void) const {
297         return &GetInfo();
298     }
299 
operator ++(void)300     CCSraRefSeqIterator& operator++(void) {
301         ++m_Iter;
302         return *this;
303     }
304 
GetRefSeqId(void) const305     const string& GetRefSeqId(void) const {
306         return m_Iter->m_SeqId;
307     }
GetRefSeq_id(void) const308     CRef<CSeq_id> GetRefSeq_id(void) const {
309         return m_Iter->GetMainSeq_id();
310     }
GetRefSeq_id_Handle(void) const311     const CSeq_id_Handle& GetRefSeq_id_Handle(void) const {
312         return m_Iter->GetMainSeq_id_Handle();
313     }
GetRefSeq_ids(void) const314     const CBioseq::TId& GetRefSeq_ids(void) const {
315         return m_Iter->m_Seq_ids;
316     }
317 
318     bool IsCircular(void) const;
319 
320     TSeqPos GetSeqLength(void) const;
321 
322     NCBI_DEPRECATED
323     size_t GetRowAlignCount(TVDBRowId row) const;
324 
325     size_t GetAlignCountAtPos(TSeqPos pos, TAlignType type = fAnyAlign) const;
326 
327     CRef<CSeq_graph> GetCoverageGraph(void) const;
328     CRef<CSeq_annot> GetCoverageAnnot(void) const;
329     CRef<CSeq_annot> GetCoverageAnnot(const string& annot_name) const;
330 
331     CRef<CSeq_annot> GetSeq_annot(void) const;
332     CRef<CSeq_annot> GetSeq_annot(const string& annot_name) const;
333 
334     enum ELoadData {
335         eLoadData,
336         eOmitData
337     };
338     CRef<CBioseq> GetRefBioseq(ELoadData load = eLoadData) const;
339     typedef list< CRef<CSeq_literal> > TLiterals;
340     typedef CRange<TSeqPos> TRange;
341     void GetRefLiterals(TLiterals& literals,
342                         TRange range,
343                         ELoadData load = eLoadData) const;
344 
345     // return array of start position of alignmnets overlapping with each page
346     // return empty array if at most one page overlapping is allowed
347     const vector<TSeqPos>& GetAlnOverStarts(void) const;
348     // return first position that is surely not covered by alignments
349     // with starting position in the argument range
350     TSeqPos GetAlnOverToOpen(TRange range) const;
351 
352     // estimate number of alignments mapped to the reference sequence
353     Uint8 GetEstimatedNumberOfAlignments(void) const;
354 
355 protected:
356     friend class CCSraAlignIterator;
357 
358     CRef<CSeq_annot> x_GetSeq_annot(const string* annot_name) const;
359 
360     static CRef<CSeq_annot> MakeSeq_annot(const string& annot_name);
361 
GetDb(void) const362     CCSraDb_Impl& GetDb(void) const {
363         return m_Db.GetNCObject();
364     }
365 
366 private:
367     CCSraDb m_Db;
368     CCSraDb_Impl::TRefInfoList::const_iterator m_Iter;
369 };
370 
371 
372 class NCBI_SRAREAD_EXPORT CCSraAlignIterator : public SCSraDb_Defs
373 {
374 public:
375     CCSraAlignIterator(void);
376 
377     enum ESearchMode {
378         eSearchByOverlap,
379         eSearchByStart
380     };
381 
382     NCBI_DEPRECATED
383     CCSraAlignIterator(const CCSraDb& csra_db,
384                        const string& ref_id,
385                        TSeqPos ref_pos,
386                        TSeqPos window = 0,
387                        ESearchMode search_mode = eSearchByOverlap,
388                        TAlignType align_type = fAnyAlign);
389     CCSraAlignIterator(const CCSraDb& csra_db,
390                        const CSeq_id_Handle& ref_id,
391                        TSeqPos ref_pos,
392                        TSeqPos window,
393                        ESearchMode search_mode,
394                        TAlignType align_type = fAnyAlign);
395     CCSraAlignIterator(const CCSraDb& csra_db,
396                        const CSeq_id_Handle& ref_id,
397                        TSeqPos ref_pos,
398                        TSeqPos window = 0,
399                        TAlignType align_type = fAnyAlign);
400     ~CCSraAlignIterator(void);
401 
402     void Reset(void);
403     CCSraAlignIterator(const CCSraAlignIterator& iter);
404     CCSraAlignIterator& operator=(const CCSraAlignIterator& iter);
405 
406     void Select(TSeqPos ref_pos,
407                 TSeqPos window = 0,
408                 ESearchMode search_mode = eSearchByOverlap,
409                 TAlignType align_type = fAnyAlign);
410 
operator const void*(void) const411     operator const void*(void) const {
412         return m_Error? 0: this;
413     }
operator !(void) const414     bool operator!(void) const {
415         return m_Error != 0;
416     }
417 
operator ++(void)418     CCSraAlignIterator& operator++(void) {
419         x_Next();
420         return *this;
421     }
422 
423     TVDBRowId GetAlignmentId(void) const;
424 
IsSecondary(void) const425     bool IsSecondary(void) const {
426         return m_AlnRowIsSecondary;
427     }
428 
429     CTempString GetRefSeqId(void) const;
GetRefSeqPos(void) const430     TSeqPos GetRefSeqPos(void) const {
431         return m_CurRefPos;
432     }
GetRefSeqLen(void) const433     TSeqPos GetRefSeqLen(void) const {
434         return m_CurRefLen;
435     }
436     bool GetRefMinusStrand(void) const;
437 
438     int GetMapQuality(void) const;
439 
440     TVDBRowId GetShortId1(void) const;
441     INSDC_coord_one GetShortId2(void) const;
442     TSeqPos GetShortPos(void) const;
443     TSeqPos GetShortLen(void) const;
444 
445     CTempString GetSpotGroup(void) const;
446 
447     bool IsSetName(void) const;
448     CTempString GetName(void) const;
449 
450     INSDC_read_filter GetReadFilter(void) const;
451 
452     CTempString GetCIGAR(void) const;
453     // returns long form of CIGAR
454     CTempString GetCIGARLong(void) const;
455     // GetMismatchRead() returns difference of the short read and reference
456     // sequence. Matching bases are represented as '='.
457     // The short read is reversed to match direction of the reference seq.
458     CTempString GetMismatchRead(void) const;
459     // GetMismatchRaw() returns only mismatched and inserted/mismatched bases.
460     CTempString GetMismatchRaw(void) const;
461     // MakeFullMismatch() generates all mismatched and all inserted bases.
462     void MakeFullMismatch(string& str,
463                           CTempString cigar,
464                           CTempString mismatch) const;
465 
GetRefSeq_id(void) const466     CRef<CSeq_id> GetRefSeq_id(void) const {
467         return m_RefIter->GetMainSeq_id();
468     }
469     CRef<CSeq_id> GetShortSeq_id(void) const;
470     CRef<CSeq_id> GetMateShortSeq_id(void) const;
471     CRef<CBioseq> GetShortBioseq(void) const;
472     CRef<CSeq_align> GetMatchAlign(void) const;
473     CRef<CSeq_graph> GetQualityGraph(void) const;
474     CRef<CSeq_annot> GetEmptyMatchAnnot(void) const;
475     CRef<CSeq_annot> GetEmptyMatchAnnot(const string& annot_name) const;
476     CRef<CSeq_annot> GetMatchAnnot(void) const;
477     CRef<CSeq_annot> GetMatchAnnot(const string& annot_name) const;
478     CRef<CSeq_annot> GetQualityGraphAnnot(void) const;
479     CRef<CSeq_annot> GetQualityGraphAnnot(const string& annot_name) const;
480     CRef<CSeq_entry> GetMatchEntry(void) const;
481     CRef<CSeq_entry> GetMatchEntry(const string& annot_name) const;
482     CRef<CSeq_annot> GetSeq_annot(void) const;
483     CRef<CSeq_annot> GetSeq_annot(const string& annot_name) const;
484 
485     static CRef<CSeq_annot> MakeSeq_annot(const string& annot_name);
486     static CRef<CSeq_annot> MakeEmptyMatchAnnot(const string& annot_name);
487     static CRef<CAnnotdesc> MakeMatchAnnotIndicator(void);
488 
489 protected:
490     friend class CCSraShortReadIterator;
491 
GetDb(void) const492     CCSraDb_Impl& GetDb(void) const {
493         return m_RefIter.GetDb();
494     }
495 
496     CCSraAlignIterator(const CCSraDb& csra_db,
497                        TAlignType align_type,
498                        TVDBRowId align_row);
499 
500     void x_Settle(void); // skip all non-matching elements
x_Next(void)501     void x_Next(void) {
502         ++m_AlnRowCur;
503         x_Settle();
504     }
505 
506     CRef<CSeq_entry> x_GetMatchEntry(const string* annot_name) const;
507     CRef<CSeq_annot> x_GetEmptyMatchAnnot(const string* annot_name) const;
508     CRef<CSeq_annot> x_GetMatchAnnot(const string* annot_name) const;
509     CRef<CSeq_annot> x_GetQualityGraphAnnot(const string* annot_name) const;
510     CRef<CSeq_annot> x_GetSeq_annot(const string* annot_name) const;
511 
512     typedef CRef<CObject_id> TObjectIdCache;
513     typedef map<CTempString, CRef<CUser_field> > TUserFieldCache;
514     CRef<CUser_object> x_GetSecondaryIndicator(void) const;
515     CObject_id& x_GetObject_id(const char* name, TObjectIdCache& cache) const;
516     CUser_field& x_AddField(CUser_object& obj,
517                             const char* name,
518                             TObjectIdCache& cache) const;
519     void x_AddField(CUser_object& obj, const char* name, CTempString value,
520                     TObjectIdCache& cache) const;
521     void x_AddField(CUser_object& obj, const char* name, int value,
522                     TObjectIdCache& cache) const;
523     void x_AddField(CUser_object& obj, const char* name, CTempString value,
524                     TObjectIdCache& id_cache, TUserFieldCache& cache,
525                     size_t max_value_length, size_t max_cache_size) const;
526 
527 private:
528     CCSraRefSeqIterator m_RefIter; // refseq selector
529     CRef<CCSraDb_Impl::SRefTableCursor> m_Ref; // VDB ref table accessor
530     CRef<CCSraDb_Impl::SAlnTableCursor> m_Aln; // VDB align table accessor
531 
532     rc_t m_Error; // result of VDB access
533     TSeqPos m_ArgRefPos, m_ArgRefLast; // requested refseq range
534     TSeqPos m_CurRefPos, m_CurRefLen; // current alignment refseq range
535 
536     TVDBRowId m_RefRowNext; // refseq row range
537     TVDBRowId m_RefRowLast;
538     bool m_AlnRowIsSecondary;
539     ESearchMode m_SearchMode;
540     TAlignType m_AlignType;
541     const TVDBRowId* m_AlnRowCur; // current refseq row alignments ids
542     const TVDBRowId* m_AlnRowEnd;
543 
544     struct SCreateCache {
545         CRef<CAnnotdesc> m_MatchAnnotIndicator;
546 
547         TObjectIdCache m_ObjectIdMateRead;
548         TObjectIdCache m_ObjectIdRefId;
549         TObjectIdCache m_ObjectIdRefPos;
550         TObjectIdCache m_ObjectIdLcl;
551         TObjectIdCache m_ObjectIdTracebacks;
552         TObjectIdCache m_ObjectIdCIGAR;
553         TObjectIdCache m_ObjectIdMISMATCH;
554         TUserFieldCache m_UserFieldCacheCigar;
555         TUserFieldCache m_UserFieldCacheMismatch;
556         CRef<CUser_object> m_SecondaryIndicator;
557         CRef<CUser_object> m_ReadFilterIndicator[4];
558     };
559     mutable AutoPtr<SCreateCache> m_CreateCache;
560 
561     SCreateCache& x_GetCreateCache(void) const;
562 };
563 
564 
565 class NCBI_SRAREAD_EXPORT CCSraShortReadIterator : public SCSraDb_Defs
566 {
567 public:
568     enum EClipType {
569         eDefaultClip,  // as defined by config
570         eNoClip,       // force no clipping
571         eClipByQuality // force clipping
572     };
573 
574     CCSraShortReadIterator(void);
575     explicit
576     CCSraShortReadIterator(const CCSraDb& csra_db,
577                            EClipType clip_type = eDefaultClip);
578     // The last constructor parameter was changed from zero-based mate_index
579     // to one-based read_id to reflect standardization of short read ids
580     // in form gnl|SRA|<SRA accesion>.<Spot id>.<Read id>.
581     CCSraShortReadIterator(const CCSraDb& csra_db,
582                            TVDBRowId spot_id,
583                            EClipType clip_type = eDefaultClip);
584     CCSraShortReadIterator(const CCSraDb& csra_db,
585                            TVDBRowId spot_id,
586                            uint32_t read_id,
587                            EClipType clip_type = eDefaultClip);
588     ~CCSraShortReadIterator(void);
589 
590     void Reset(void);
591     CCSraShortReadIterator(const CCSraShortReadIterator& iter);
592     CCSraShortReadIterator& operator=(const CCSraShortReadIterator& iter);
593 
594     bool Select(TVDBRowId spot_id);
595     bool Select(TVDBRowId spot_id, uint32_t read_id);
596     void SetLastSpotId(TVDBRowId spot_id);
597 
operator const void*(void) const598     operator const void*(void) const {
599         return m_Error? 0: this;
600     }
operator !(void) const601     bool operator!(void) const {
602         return m_Error != 0;
603     }
604 
operator ++(void)605     CCSraShortReadIterator& operator++(void) {
606         x_Next();
607         return *this;
608     }
609 
GetSpotId(void) const610     TVDBRowId GetSpotId(void) const {
611         return m_SpotId;
612     }
GetMaxSpotId(void) const613     TVDBRowId GetMaxSpotId(void) const {
614         return m_MaxSpotId;
615     }
GetReadId(void) const616     uint32_t GetReadId(void) const {
617         return m_ReadId;
618     }
GetMaxReadId(void) const619     uint32_t GetMaxReadId(void) const {
620         return m_MaxReadId;
621     }
622 
623     // Use GetReadId() instead of GetMateIndex().
624     // Note that GetReadId() is one-based and GetMateIndex() is zero-based.
GetMateIndex(void) const625     NCBI_DEPRECATED uint32_t GetMateIndex(void) const {
626         return GetReadId()-1;
627     }
628     // Number of reads in this spot.
GetReadCount(void) const629     uint32_t GetReadCount(void) const {
630         return GetMaxReadId();
631     }
632     // Number of biological reads without taking into account any clipping.
633     uint32_t GetMateCount(void) const;
634 
635     CTempString GetSpotGroup(void) const;
636 
637     bool IsSetName(void) const;
638     CTempString GetName(void) const;
639 
640     // Returns true if current read has clipping info that can or does
641     // reduce sequence length.
642     bool HasClippingInfo(void) const;
643     // Returns true if current read is actually clipped by quality.
644     // It can be true only if clipping by quality is on.
IsClippedByQuality(void) const645     bool IsClippedByQuality(void) const {
646         return m_ClipByQuality && HasClippingInfo();
647     }
648     // Returns true if current read has actual clipping info that is not
649     // applied because clipping by quality is off.
ShouldBeClippedByQuality(void) const650     bool ShouldBeClippedByQuality(void) const {
651         return !m_ClipByQuality && HasClippingInfo();
652     }
653 
654     CTempString GetReadData(EClipType clip_type = eDefaultClip) const;
655 
GetShortId1(void) const656     TVDBRowId GetShortId1(void) const {
657         return GetSpotId();
658     }
GetShortId2(void) const659     uint32_t GetShortId2(void) const {
660         return GetReadId();
661     }
662 
663     bool IsTechnicalRead(void) const;
664 
665     INSDC_read_filter GetReadFilter(void) const;
666 
667     // returns current read range inside spot
668     typedef COpenRange<TSeqPos> TOpenRange;
669     TOpenRange GetReadRange(EClipType clip_type = eDefaultClip) const;
670 
GetShortLen(void) const671     TSeqPos GetShortLen(void) const {
672         return GetReadRange().GetLength();
673     }
674 
675     CRef<CSeq_id> GetShortSeq_id(void) const;
676 
677     // clip coordinate (inclusive)
678     TSeqPos GetClipQualityLeft(void) const;
679     TSeqPos GetClipQualityLength(void) const;
GetClipQualityRight(void) const680     TSeqPos GetClipQualityRight(void) const
681         {
682             // inclusive
683             return GetClipQualityLeft() + GetClipQualityLength() - 1;
684         }
685 
686     CRef<CSeq_graph> GetQualityGraph(void) const;
687     CRef<CSeq_annot> GetQualityGraphAnnot(void) const;
688     CRef<CSeq_annot> GetQualityGraphAnnot(const string& annot_name) const;
689 
690     enum EBioseqFlags {
691         fQualityGraph = 1<<0,
692         fDefaultBioseqFlags = 0
693     };
694     typedef int TBioseqFlags;
695     CRef<CBioseq> GetShortBioseq(TBioseqFlags flags = fDefaultBioseqFlags) const;
696 
697     CCSraRefSeqIterator GetRefSeqIter(TSeqPos* ref_pos_ptr = NULL) const;
698     CCSraAlignIterator GetAlignIter() const;
699 
700 protected:
GetDb(void) const701     CCSraDb_Impl& GetDb(void) const {
702         return m_Db.GetNCObject();
703     }
704 
705     void x_Init(const CCSraDb& csra_db, EClipType clip_type);
706     bool x_ValidRead(void) const;
707     bool x_Settle(bool single_spot = false);
x_Next(void)708     bool x_Next(void) {
709         ++m_ReadId;
710         return x_Settle();
711     }
712 
713 
714     void x_GetMaxReadId(void);
715     CTempString x_GetReadData(TOpenRange range) const;
716 
717     CRef<CSeq_annot> x_GetSeq_annot(const string* annot_name) const;
718     CRef<CSeq_annot> x_GetQualityGraphAnnot(const string* annot_name) const;
719     CRef<CSeq_annot> x_GetQualityGraphAnnot(TOpenRange range,
720                                             const string* annot_name) const;
721     CRef<CSeq_graph> x_GetQualityGraph(TOpenRange range) const;
722 
723 private:
724     CCSraDb m_Db; // refseq selector
725     CRef<CCSraDb_Impl::SSeqTableCursor> m_Seq; // VDB sequence table accessor
726 
727     TVDBRowId m_SpotId;
728     TVDBRowId m_MaxSpotId;
729     uint32_t m_ReadId;
730     uint32_t m_MaxReadId;
731     bool m_IncludeTechnicalReads;
732     bool m_ClipByQuality;
733 
734     rc_t m_Error; // result of VDB access
735 };
736 
737 
738 /////////////////////////////////////////////////////////////////////////////
739 // CCSraRefSeqIterator
740 
741 inline
742 CRef<CSeq_annot>
GetSeq_annot(const string & annot_name) const743 CCSraRefSeqIterator::GetSeq_annot(const string& annot_name) const
744 {
745     return x_GetSeq_annot(&annot_name);
746 }
747 
748 
749 inline
750 CRef<CSeq_annot>
GetSeq_annot(void) const751 CCSraRefSeqIterator::GetSeq_annot(void) const
752 {
753     return x_GetSeq_annot(0);
754 }
755 
756 
757 /////////////////////////////////////////////////////////////////////////////
758 // CCSraAlignIterator
759 
760 inline
761 CRef<CSeq_entry>
GetMatchEntry(const string & annot_name) const762 CCSraAlignIterator::GetMatchEntry(const string& annot_name) const
763 {
764     return x_GetMatchEntry(&annot_name);
765 }
766 
767 
768 inline
769 CRef<CSeq_entry>
GetMatchEntry(void) const770 CCSraAlignIterator::GetMatchEntry(void) const
771 {
772     return x_GetMatchEntry(0);
773 }
774 
775 
776 inline
777 CRef<CSeq_annot>
GetEmptyMatchAnnot(const string & annot_name) const778 CCSraAlignIterator::GetEmptyMatchAnnot(const string& annot_name) const
779 {
780     return x_GetEmptyMatchAnnot(&annot_name);
781 }
782 
783 
784 inline
785 CRef<CSeq_annot>
GetEmptyMatchAnnot(void) const786 CCSraAlignIterator::GetEmptyMatchAnnot(void) const
787 {
788     return x_GetEmptyMatchAnnot(0);
789 }
790 
791 
792 inline
793 CRef<CSeq_annot>
GetMatchAnnot(const string & annot_name) const794 CCSraAlignIterator::GetMatchAnnot(const string& annot_name) const
795 {
796     return x_GetMatchAnnot(&annot_name);
797 }
798 
799 
800 inline
801 CRef<CSeq_annot>
GetMatchAnnot(void) const802 CCSraAlignIterator::GetMatchAnnot(void) const
803 {
804     return x_GetMatchAnnot(0);
805 }
806 
807 
808 inline
809 CRef<CSeq_annot>
GetQualityGraphAnnot(const string & annot_name) const810 CCSraAlignIterator::GetQualityGraphAnnot(const string& annot_name) const
811 {
812     return x_GetQualityGraphAnnot(&annot_name);
813 }
814 
815 
816 inline
817 CRef<CSeq_annot>
GetQualityGraphAnnot(void) const818 CCSraAlignIterator::GetQualityGraphAnnot(void) const
819 {
820     return x_GetQualityGraphAnnot(0);
821 }
822 
823 
824 inline
825 CRef<CSeq_annot>
x_GetSeq_annot(const string * annot_name) const826 CCSraAlignIterator::x_GetSeq_annot(const string* annot_name) const
827 {
828     return m_RefIter.x_GetSeq_annot(annot_name);
829 }
830 
831 
832 inline
833 CRef<CSeq_annot>
GetSeq_annot(const string & annot_name) const834 CCSraAlignIterator::GetSeq_annot(const string& annot_name) const
835 {
836     return x_GetSeq_annot(&annot_name);
837 }
838 
839 
840 inline
841 CRef<CSeq_annot>
GetSeq_annot(void) const842 CCSraAlignIterator::GetSeq_annot(void) const
843 {
844     return x_GetSeq_annot(0);
845 }
846 
847 
848 inline
849 CRef<CSeq_annot>
MakeSeq_annot(const string & annot_name)850 CCSraAlignIterator::MakeSeq_annot(const string& annot_name)
851 {
852     return CCSraRefSeqIterator::MakeSeq_annot(annot_name);
853 }
854 
855 
856 /////////////////////////////////////////////////////////////////////////////
857 // CCSraShortReadIterator
858 
859 inline
860 CRef<CSeq_annot>
GetQualityGraphAnnot(const string & annot_name) const861 CCSraShortReadIterator::GetQualityGraphAnnot(const string& annot_name) const
862 {
863     return x_GetQualityGraphAnnot(&annot_name);
864 }
865 
866 
867 inline
868 CRef<CSeq_annot>
GetQualityGraphAnnot(void) const869 CCSraShortReadIterator::GetQualityGraphAnnot(void) const
870 {
871     return x_GetQualityGraphAnnot(0);
872 }
873 
874 
875 END_NAMESPACE(objects);
876 END_NCBI_NAMESPACE;
877 
878 #endif // SRA__READER__SRA__CSRAREAD__HPP
879