1 #ifndef SRA__READER__SRA__CSRAREAD__HPP
2 #define SRA__READER__SRA__CSRAREAD__HPP
3 /* $Id: csraread.hpp 593794 2019-09-24 17:56:58Z vasilche $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Authors: Eugene Vasilchenko
29 *
30 * File Description:
31 * Access to cSRA files
32 *
33 */
34
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbiobj.hpp>
37 #include <corelib/ncbimtx.hpp>
38 #include <util/range.hpp>
39 #include <sra/readers/sra/vdbread.hpp>
40 #include <objects/seq/seq_id_handle.hpp>
41 #include <objects/seq/Annotdesc.hpp>
42 #include <objects/seq/Seq_literal.hpp>
43 #include <map>
44 #include <list>
45
// Fixed-width stand-ins for the INSDC types used by this header.
// NOTE(review): presumably declared here so that the SDK header below does
// not have to be included by clients of this file -- confirm.
//#include <insdc/sra.h> // for INSDC_coord_one, INSDC_coord_len, INSDC_read_filter
typedef int32_t  INSDC_coord_one;   // signed 32-bit value
typedef uint32_t INSDC_coord_len;   // unsigned 32-bit value
typedef uint8_t  INSDC_read_filter; // unsigned 8-bit value
50
51 BEGIN_NCBI_NAMESPACE;
52 BEGIN_NAMESPACE(objects);
53
// Iterators declared later in this header (named as friends of CCSraDb_Impl).
class CCSraRefSeqIterator;
class CCSraAlignIterator;
class CCSraShortReadIterator;

// Types that this header only needs as incomplete types
// (used through CRef<> or plain pointers/references).
class CBioseq;
class CSeq_align;
class CSeq_annot;
class CSeq_entry;
class CSeq_graph;
class CUser_field;
class CUser_object;
class IIdMapper;
65
// Enumerations shared by the cSRA classes of this header.
// CCSraDb_Impl, CCSraDb and all iterator classes derive from this struct,
// so the enumerators can be used unqualified inside them.
struct SCSraDb_Defs
{
    // Kind of reference sequence id; passed as 'ref_id_type' to the
    // CCSraDb / CCSraDb_Impl constructors.
    enum ERefIdType {
        eRefId_SEQ_ID,
        eRefId_gnl_NAME
    };

    // Argument of CCSraDb::MakeSraIdPart().
    enum EPathInIdType {
        ePathInId_config,
        ePathInId_yes,
        ePathInId_no
    };

    // Bit flags selecting alignment kinds; fAnyAlign combines both bits.
    enum EAlignType {
        fPrimaryAlign   = 1 << 0,
        fSecondaryAlign = 1 << 1,
        fAnyAlign       = fPrimaryAlign | fSecondaryAlign
    };
    typedef EAlignType TAlignType;
};
84
85
86 class NCBI_SRAREAD_EXPORT CCSraDb_Impl : public CObject, public SCSraDb_Defs
87 {
88 public:
89 CCSraDb_Impl(CVDBMgr& mgr, const string& csra_path,
90 IIdMapper* ref_id_mapper,
91 ERefIdType ref_id_type,
92 const string& sra_id_part);
93 virtual ~CCSraDb_Impl(void);
94
95 // SRefInfo holds cached refseq information - ids, len, rows
96 struct SRefInfo {
97 string m_Name, m_SeqId;
98 mutable volatile TSeqPos m_SeqLength; // actual len will be updated
99 CBioseq::TId m_Seq_ids;
100 CSeq_id_Handle m_Seq_id_Handle;
GetMainSeq_idCCSraDb_Impl::SRefInfo101 CRef<CSeq_id>& GetMainSeq_id(void) {
102 return m_Seq_ids.front();
103 }
GetMainSeq_idCCSraDb_Impl::SRefInfo104 const CRef<CSeq_id>& GetMainSeq_id(void) const {
105 return m_Seq_ids.front();
106 }
GetMainSeq_id_HandleCCSraDb_Impl::SRefInfo107 const CSeq_id_Handle& GetMainSeq_id_Handle(void) const {
108 return m_Seq_id_Handle;
109 }
110 TVDBRowId m_RowFirst, m_RowLast;
111 bool m_Circular;
112 vector<TSeqPos> m_AlnOverStarts; // relative to m_RowFirst
113 };
114 typedef list<SRefInfo> TRefInfoList;
115 typedef map<string, TRefInfoList::iterator, PNocase> TRefInfoMapByName;
116 typedef map<CSeq_id_Handle, TRefInfoList::iterator> TRefInfoMapBySeq_id;
117
GetRefInfoList(void) const118 const TRefInfoList& GetRefInfoList(void) const {
119 return m_RefList;
120 }
GetRefInfoMapByName(void) const121 const TRefInfoMapByName& GetRefInfoMapByName(void) const {
122 return m_RefMapByName;
123 }
GetRefInfoMapBySeq_id(void) const124 const TRefInfoMapBySeq_id& GetRefInfoMapBySeq_id(void) const {
125 return m_RefMapBySeq_id;
126 }
127
GetRowSize(void) const128 TSeqPos GetRowSize(void) const {
129 return m_RowSize;
130 }
131
132 typedef vector<string> TSpotGroups;
133 void GetSpotGroups(TSpotGroups& spot_groups);
134
GetCSraPath(void) const135 const string& GetCSraPath(void) const {
136 return m_CSraPath;
137 }
138
GetSraIdPart(void) const139 const string& GetSraIdPart(void) const {
140 return m_SraIdPart;
141 }
SetSraIdPart(const string & s)142 void SetSraIdPart(const string& s) {
143 m_SraIdPart = s;
144 }
145
146 CRef<CSeq_id> MakeShortReadId(TVDBRowId id1, INSDC_coord_one id2) const;
147 void SetShortReadId(string& str, TVDBRowId id1, INSDC_coord_one id2) const;
148
149 protected:
150 friend class CCSraRefSeqIterator;
151 friend class CCSraAlignIterator;
152 friend class CCSraShortReadIterator;
153
154 void x_CalcSeqLength(const SRefInfo& info);
155
156 // SRefTableCursor is helper accessor structure for refseq table
157 struct SRefTableCursor;
158 // SAlnTableCursor is helper accessor structure for align table
159 struct SAlnTableCursor;
160 // SSeqTableCursor is helper accessor structure for sequence table
161 struct SSeqTableCursor;
162 friend struct SSeqTableCursor;
163
164 // get table accessor object for exclusive access
165 CRef<SRefTableCursor> Ref(void);
166 CRef<SAlnTableCursor> Aln(bool is_secondary);
167 CRef<SSeqTableCursor> Seq(void);
168 // return table accessor object for reuse
169 void Put(CRef<SRefTableCursor>& curs);
170 void Put(CRef<SAlnTableCursor>& curs);
171 void Put(CRef<SSeqTableCursor>& curs);
172
173 protected:
174 void OpenRefTable(void);
175 void OpenAlnTable(bool is_secondary);
176 void OpenSeqTable(void);
177
RefTable(void)178 const CVDBTable& RefTable(void) {
179 const CVDBTable& table = m_RefTable;
180 if ( !table ) {
181 OpenRefTable();
182 }
183 return table;
184 }
AlnTable(bool is_secondary)185 const CVDBTable& AlnTable(bool is_secondary) {
186 const CVDBTable& table = m_AlnTable[is_secondary];
187 if ( !table ) {
188 OpenAlnTable(is_secondary);
189 }
190 return table;
191 }
SeqTable(void)192 const CVDBTable& SeqTable(void) {
193 const CVDBTable& table = m_SeqTable;
194 if ( !table ) {
195 OpenSeqTable();
196 }
197 return table;
198 }
199
200 void x_MakeRefSeq_ids(SRefInfo& info,
201 IIdMapper* ref_id_mapper,
202 int ref_id_type);
203
204 private:
205 CVDBMgr m_Mgr;
206 CVDB m_Db;
207 string m_CSraPath;
208 string m_SraIdPart;
209
210 CFastMutex m_TableMutex;
211 CFastMutex m_OverlapMutex;
212 CVDBTable m_RefTable;
213 CVDBTable m_AlnTable[2];
214 CVDBTable m_SeqTable;
215
216 CVDBObjectCache<SRefTableCursor> m_Ref;
217 CVDBObjectCache<SAlnTableCursor> m_Aln[2];
218 CVDBObjectCache<SSeqTableCursor> m_Seq;
219
220 TSeqPos m_RowSize; // cached size of refseq row in bases
221 TRefInfoList m_RefList; // list of cached refseqs' information
222 TRefInfoMapByName m_RefMapByName; // index for refseq info lookup
223 TRefInfoMapBySeq_id m_RefMapBySeq_id; // index for refseq info lookup
224 };
225
226
227 class CCSraDb : public CRef<CCSraDb_Impl>, public SCSraDb_Defs
228 {
229 public:
CCSraDb(void)230 CCSraDb(void)
231 {
232 }
233 NCBI_SRAREAD_EXPORT
234 CCSraDb(CVDBMgr& mgr,
235 const string& csra_path,
236 IIdMapper* ref_id_mapper = NULL,
237 ERefIdType ref_id_type = eRefId_SEQ_ID);
238 NCBI_SRAREAD_EXPORT
239 CCSraDb(CVDBMgr& mgr,
240 const string& csra_path,
241 const string& sra_id_part,
242 IIdMapper* ref_id_mapper = NULL,
243 ERefIdType ref_id_type = eRefId_SEQ_ID);
244
245 NCBI_SRAREAD_EXPORT
246 static string MakeSraIdPart(EPathInIdType path_in_id_type,
247 const string& dir_path,
248 const string& csra_file);
249
GetRowSize(void) const250 TSeqPos GetRowSize(void) const
251 {
252 return GetObject().GetRowSize();
253 }
254
255 typedef CCSraDb_Impl::TSpotGroups TSpotGroups;
GetSpotGroups(TSpotGroups & spot_groups)256 void GetSpotGroups(TSpotGroups& spot_groups)
257 {
258 GetObject().GetSpotGroups(spot_groups);
259 }
260 };
261
262
263 class NCBI_SRAREAD_EXPORT CCSraRefSeqIterator : public SCSraDb_Defs
264 {
265 public:
CCSraRefSeqIterator(void)266 CCSraRefSeqIterator(void)
267 {
268 }
CCSraRefSeqIterator(const CCSraDb & csra_db,CCSraDb_Impl::TRefInfoList::const_iterator iter)269 CCSraRefSeqIterator(const CCSraDb& csra_db,
270 CCSraDb_Impl::TRefInfoList::const_iterator iter)
271 : m_Db(csra_db),
272 m_Iter(iter)
273 {
274 }
275 explicit CCSraRefSeqIterator(const CCSraDb& csra_db);
276 NCBI_DEPRECATED
277 CCSraRefSeqIterator(const CCSraDb& csra_db, const string& seq_id);
278 enum EByName {
279 eByName
280 };
281 CCSraRefSeqIterator(const CCSraDb& csra_db, const string& name,
282 EByName /*by_name*/);
283 CCSraRefSeqIterator(const CCSraDb& csra_db, const CSeq_id_Handle& seq_id);
284
operator !(void) const285 bool operator!(void) const {
286 return !m_Db || m_Iter == m_Db->GetRefInfoList().end();
287 }
operator const void*(void) const288 operator const void*(void) const {
289 return !*this? 0: this;
290 }
291
292 const CCSraDb_Impl::SRefInfo& GetInfo(void) const;
operator *(void) const293 const CCSraDb_Impl::SRefInfo& operator*(void) const {
294 return GetInfo();
295 }
operator ->(void) const296 const CCSraDb_Impl::SRefInfo* operator->(void) const {
297 return &GetInfo();
298 }
299
operator ++(void)300 CCSraRefSeqIterator& operator++(void) {
301 ++m_Iter;
302 return *this;
303 }
304
GetRefSeqId(void) const305 const string& GetRefSeqId(void) const {
306 return m_Iter->m_SeqId;
307 }
GetRefSeq_id(void) const308 CRef<CSeq_id> GetRefSeq_id(void) const {
309 return m_Iter->GetMainSeq_id();
310 }
GetRefSeq_id_Handle(void) const311 const CSeq_id_Handle& GetRefSeq_id_Handle(void) const {
312 return m_Iter->GetMainSeq_id_Handle();
313 }
GetRefSeq_ids(void) const314 const CBioseq::TId& GetRefSeq_ids(void) const {
315 return m_Iter->m_Seq_ids;
316 }
317
318 bool IsCircular(void) const;
319
320 TSeqPos GetSeqLength(void) const;
321
322 NCBI_DEPRECATED
323 size_t GetRowAlignCount(TVDBRowId row) const;
324
325 size_t GetAlignCountAtPos(TSeqPos pos, TAlignType type = fAnyAlign) const;
326
327 CRef<CSeq_graph> GetCoverageGraph(void) const;
328 CRef<CSeq_annot> GetCoverageAnnot(void) const;
329 CRef<CSeq_annot> GetCoverageAnnot(const string& annot_name) const;
330
331 CRef<CSeq_annot> GetSeq_annot(void) const;
332 CRef<CSeq_annot> GetSeq_annot(const string& annot_name) const;
333
334 enum ELoadData {
335 eLoadData,
336 eOmitData
337 };
338 CRef<CBioseq> GetRefBioseq(ELoadData load = eLoadData) const;
339 typedef list< CRef<CSeq_literal> > TLiterals;
340 typedef CRange<TSeqPos> TRange;
341 void GetRefLiterals(TLiterals& literals,
342 TRange range,
343 ELoadData load = eLoadData) const;
344
345 // return array of start position of alignmnets overlapping with each page
346 // return empty array if at most one page overlapping is allowed
347 const vector<TSeqPos>& GetAlnOverStarts(void) const;
348 // return first position that is surely not covered by alignments
349 // with starting position in the argument range
350 TSeqPos GetAlnOverToOpen(TRange range) const;
351
352 // estimate number of alignments mapped to the reference sequence
353 Uint8 GetEstimatedNumberOfAlignments(void) const;
354
355 protected:
356 friend class CCSraAlignIterator;
357
358 CRef<CSeq_annot> x_GetSeq_annot(const string* annot_name) const;
359
360 static CRef<CSeq_annot> MakeSeq_annot(const string& annot_name);
361
GetDb(void) const362 CCSraDb_Impl& GetDb(void) const {
363 return m_Db.GetNCObject();
364 }
365
366 private:
367 CCSraDb m_Db;
368 CCSraDb_Impl::TRefInfoList::const_iterator m_Iter;
369 };
370
371
372 class NCBI_SRAREAD_EXPORT CCSraAlignIterator : public SCSraDb_Defs
373 {
374 public:
375 CCSraAlignIterator(void);
376
377 enum ESearchMode {
378 eSearchByOverlap,
379 eSearchByStart
380 };
381
382 NCBI_DEPRECATED
383 CCSraAlignIterator(const CCSraDb& csra_db,
384 const string& ref_id,
385 TSeqPos ref_pos,
386 TSeqPos window = 0,
387 ESearchMode search_mode = eSearchByOverlap,
388 TAlignType align_type = fAnyAlign);
389 CCSraAlignIterator(const CCSraDb& csra_db,
390 const CSeq_id_Handle& ref_id,
391 TSeqPos ref_pos,
392 TSeqPos window,
393 ESearchMode search_mode,
394 TAlignType align_type = fAnyAlign);
395 CCSraAlignIterator(const CCSraDb& csra_db,
396 const CSeq_id_Handle& ref_id,
397 TSeqPos ref_pos,
398 TSeqPos window = 0,
399 TAlignType align_type = fAnyAlign);
400 ~CCSraAlignIterator(void);
401
402 void Reset(void);
403 CCSraAlignIterator(const CCSraAlignIterator& iter);
404 CCSraAlignIterator& operator=(const CCSraAlignIterator& iter);
405
406 void Select(TSeqPos ref_pos,
407 TSeqPos window = 0,
408 ESearchMode search_mode = eSearchByOverlap,
409 TAlignType align_type = fAnyAlign);
410
operator const void*(void) const411 operator const void*(void) const {
412 return m_Error? 0: this;
413 }
operator !(void) const414 bool operator!(void) const {
415 return m_Error != 0;
416 }
417
operator ++(void)418 CCSraAlignIterator& operator++(void) {
419 x_Next();
420 return *this;
421 }
422
423 TVDBRowId GetAlignmentId(void) const;
424
IsSecondary(void) const425 bool IsSecondary(void) const {
426 return m_AlnRowIsSecondary;
427 }
428
429 CTempString GetRefSeqId(void) const;
GetRefSeqPos(void) const430 TSeqPos GetRefSeqPos(void) const {
431 return m_CurRefPos;
432 }
GetRefSeqLen(void) const433 TSeqPos GetRefSeqLen(void) const {
434 return m_CurRefLen;
435 }
436 bool GetRefMinusStrand(void) const;
437
438 int GetMapQuality(void) const;
439
440 TVDBRowId GetShortId1(void) const;
441 INSDC_coord_one GetShortId2(void) const;
442 TSeqPos GetShortPos(void) const;
443 TSeqPos GetShortLen(void) const;
444
445 CTempString GetSpotGroup(void) const;
446
447 bool IsSetName(void) const;
448 CTempString GetName(void) const;
449
450 INSDC_read_filter GetReadFilter(void) const;
451
452 CTempString GetCIGAR(void) const;
453 // returns long form of CIGAR
454 CTempString GetCIGARLong(void) const;
455 // GetMismatchRead() returns difference of the short read and reference
456 // sequence. Matching bases are represented as '='.
457 // The short read is reversed to match direction of the reference seq.
458 CTempString GetMismatchRead(void) const;
459 // GetMismatchRaw() returns only mismatched and inserted/mismatched bases.
460 CTempString GetMismatchRaw(void) const;
461 // MakeFullMismatch() generates all mismatched and all inserted bases.
462 void MakeFullMismatch(string& str,
463 CTempString cigar,
464 CTempString mismatch) const;
465
GetRefSeq_id(void) const466 CRef<CSeq_id> GetRefSeq_id(void) const {
467 return m_RefIter->GetMainSeq_id();
468 }
469 CRef<CSeq_id> GetShortSeq_id(void) const;
470 CRef<CSeq_id> GetMateShortSeq_id(void) const;
471 CRef<CBioseq> GetShortBioseq(void) const;
472 CRef<CSeq_align> GetMatchAlign(void) const;
473 CRef<CSeq_graph> GetQualityGraph(void) const;
474 CRef<CSeq_annot> GetEmptyMatchAnnot(void) const;
475 CRef<CSeq_annot> GetEmptyMatchAnnot(const string& annot_name) const;
476 CRef<CSeq_annot> GetMatchAnnot(void) const;
477 CRef<CSeq_annot> GetMatchAnnot(const string& annot_name) const;
478 CRef<CSeq_annot> GetQualityGraphAnnot(void) const;
479 CRef<CSeq_annot> GetQualityGraphAnnot(const string& annot_name) const;
480 CRef<CSeq_entry> GetMatchEntry(void) const;
481 CRef<CSeq_entry> GetMatchEntry(const string& annot_name) const;
482 CRef<CSeq_annot> GetSeq_annot(void) const;
483 CRef<CSeq_annot> GetSeq_annot(const string& annot_name) const;
484
485 static CRef<CSeq_annot> MakeSeq_annot(const string& annot_name);
486 static CRef<CSeq_annot> MakeEmptyMatchAnnot(const string& annot_name);
487 static CRef<CAnnotdesc> MakeMatchAnnotIndicator(void);
488
489 protected:
490 friend class CCSraShortReadIterator;
491
GetDb(void) const492 CCSraDb_Impl& GetDb(void) const {
493 return m_RefIter.GetDb();
494 }
495
496 CCSraAlignIterator(const CCSraDb& csra_db,
497 TAlignType align_type,
498 TVDBRowId align_row);
499
500 void x_Settle(void); // skip all non-matching elements
x_Next(void)501 void x_Next(void) {
502 ++m_AlnRowCur;
503 x_Settle();
504 }
505
506 CRef<CSeq_entry> x_GetMatchEntry(const string* annot_name) const;
507 CRef<CSeq_annot> x_GetEmptyMatchAnnot(const string* annot_name) const;
508 CRef<CSeq_annot> x_GetMatchAnnot(const string* annot_name) const;
509 CRef<CSeq_annot> x_GetQualityGraphAnnot(const string* annot_name) const;
510 CRef<CSeq_annot> x_GetSeq_annot(const string* annot_name) const;
511
512 typedef CRef<CObject_id> TObjectIdCache;
513 typedef map<CTempString, CRef<CUser_field> > TUserFieldCache;
514 CRef<CUser_object> x_GetSecondaryIndicator(void) const;
515 CObject_id& x_GetObject_id(const char* name, TObjectIdCache& cache) const;
516 CUser_field& x_AddField(CUser_object& obj,
517 const char* name,
518 TObjectIdCache& cache) const;
519 void x_AddField(CUser_object& obj, const char* name, CTempString value,
520 TObjectIdCache& cache) const;
521 void x_AddField(CUser_object& obj, const char* name, int value,
522 TObjectIdCache& cache) const;
523 void x_AddField(CUser_object& obj, const char* name, CTempString value,
524 TObjectIdCache& id_cache, TUserFieldCache& cache,
525 size_t max_value_length, size_t max_cache_size) const;
526
527 private:
528 CCSraRefSeqIterator m_RefIter; // refseq selector
529 CRef<CCSraDb_Impl::SRefTableCursor> m_Ref; // VDB ref table accessor
530 CRef<CCSraDb_Impl::SAlnTableCursor> m_Aln; // VDB align table accessor
531
532 rc_t m_Error; // result of VDB access
533 TSeqPos m_ArgRefPos, m_ArgRefLast; // requested refseq range
534 TSeqPos m_CurRefPos, m_CurRefLen; // current alignment refseq range
535
536 TVDBRowId m_RefRowNext; // refseq row range
537 TVDBRowId m_RefRowLast;
538 bool m_AlnRowIsSecondary;
539 ESearchMode m_SearchMode;
540 TAlignType m_AlignType;
541 const TVDBRowId* m_AlnRowCur; // current refseq row alignments ids
542 const TVDBRowId* m_AlnRowEnd;
543
544 struct SCreateCache {
545 CRef<CAnnotdesc> m_MatchAnnotIndicator;
546
547 TObjectIdCache m_ObjectIdMateRead;
548 TObjectIdCache m_ObjectIdRefId;
549 TObjectIdCache m_ObjectIdRefPos;
550 TObjectIdCache m_ObjectIdLcl;
551 TObjectIdCache m_ObjectIdTracebacks;
552 TObjectIdCache m_ObjectIdCIGAR;
553 TObjectIdCache m_ObjectIdMISMATCH;
554 TUserFieldCache m_UserFieldCacheCigar;
555 TUserFieldCache m_UserFieldCacheMismatch;
556 CRef<CUser_object> m_SecondaryIndicator;
557 CRef<CUser_object> m_ReadFilterIndicator[4];
558 };
559 mutable AutoPtr<SCreateCache> m_CreateCache;
560
561 SCreateCache& x_GetCreateCache(void) const;
562 };
563
564
565 class NCBI_SRAREAD_EXPORT CCSraShortReadIterator : public SCSraDb_Defs
566 {
567 public:
568 enum EClipType {
569 eDefaultClip, // as defined by config
570 eNoClip, // force no clipping
571 eClipByQuality // force clipping
572 };
573
574 CCSraShortReadIterator(void);
575 explicit
576 CCSraShortReadIterator(const CCSraDb& csra_db,
577 EClipType clip_type = eDefaultClip);
578 // The last constructor parameter was changed from zero-based mate_index
579 // to one-based read_id to reflect standardization of short read ids
580 // in form gnl|SRA|<SRA accesion>.<Spot id>.<Read id>.
581 CCSraShortReadIterator(const CCSraDb& csra_db,
582 TVDBRowId spot_id,
583 EClipType clip_type = eDefaultClip);
584 CCSraShortReadIterator(const CCSraDb& csra_db,
585 TVDBRowId spot_id,
586 uint32_t read_id,
587 EClipType clip_type = eDefaultClip);
588 ~CCSraShortReadIterator(void);
589
590 void Reset(void);
591 CCSraShortReadIterator(const CCSraShortReadIterator& iter);
592 CCSraShortReadIterator& operator=(const CCSraShortReadIterator& iter);
593
594 bool Select(TVDBRowId spot_id);
595 bool Select(TVDBRowId spot_id, uint32_t read_id);
596 void SetLastSpotId(TVDBRowId spot_id);
597
operator const void*(void) const598 operator const void*(void) const {
599 return m_Error? 0: this;
600 }
operator !(void) const601 bool operator!(void) const {
602 return m_Error != 0;
603 }
604
operator ++(void)605 CCSraShortReadIterator& operator++(void) {
606 x_Next();
607 return *this;
608 }
609
GetSpotId(void) const610 TVDBRowId GetSpotId(void) const {
611 return m_SpotId;
612 }
GetMaxSpotId(void) const613 TVDBRowId GetMaxSpotId(void) const {
614 return m_MaxSpotId;
615 }
GetReadId(void) const616 uint32_t GetReadId(void) const {
617 return m_ReadId;
618 }
GetMaxReadId(void) const619 uint32_t GetMaxReadId(void) const {
620 return m_MaxReadId;
621 }
622
623 // Use GetReadId() instead of GetMateIndex().
624 // Note that GetReadId() is one-based and GetMateIndex() is zero-based.
GetMateIndex(void) const625 NCBI_DEPRECATED uint32_t GetMateIndex(void) const {
626 return GetReadId()-1;
627 }
628 // Number of reads in this spot.
GetReadCount(void) const629 uint32_t GetReadCount(void) const {
630 return GetMaxReadId();
631 }
632 // Number of biological reads without taking into account any clipping.
633 uint32_t GetMateCount(void) const;
634
635 CTempString GetSpotGroup(void) const;
636
637 bool IsSetName(void) const;
638 CTempString GetName(void) const;
639
640 // Returns true if current read has clipping info that can or does
641 // reduce sequence length.
642 bool HasClippingInfo(void) const;
643 // Returns true if current read is actually clipped by quality.
644 // It can be true only if clipping by quality is on.
IsClippedByQuality(void) const645 bool IsClippedByQuality(void) const {
646 return m_ClipByQuality && HasClippingInfo();
647 }
648 // Returns true if current read has actual clipping info that is not
649 // applied because clipping by quality is off.
ShouldBeClippedByQuality(void) const650 bool ShouldBeClippedByQuality(void) const {
651 return !m_ClipByQuality && HasClippingInfo();
652 }
653
654 CTempString GetReadData(EClipType clip_type = eDefaultClip) const;
655
GetShortId1(void) const656 TVDBRowId GetShortId1(void) const {
657 return GetSpotId();
658 }
GetShortId2(void) const659 uint32_t GetShortId2(void) const {
660 return GetReadId();
661 }
662
663 bool IsTechnicalRead(void) const;
664
665 INSDC_read_filter GetReadFilter(void) const;
666
667 // returns current read range inside spot
668 typedef COpenRange<TSeqPos> TOpenRange;
669 TOpenRange GetReadRange(EClipType clip_type = eDefaultClip) const;
670
GetShortLen(void) const671 TSeqPos GetShortLen(void) const {
672 return GetReadRange().GetLength();
673 }
674
675 CRef<CSeq_id> GetShortSeq_id(void) const;
676
677 // clip coordinate (inclusive)
678 TSeqPos GetClipQualityLeft(void) const;
679 TSeqPos GetClipQualityLength(void) const;
GetClipQualityRight(void) const680 TSeqPos GetClipQualityRight(void) const
681 {
682 // inclusive
683 return GetClipQualityLeft() + GetClipQualityLength() - 1;
684 }
685
686 CRef<CSeq_graph> GetQualityGraph(void) const;
687 CRef<CSeq_annot> GetQualityGraphAnnot(void) const;
688 CRef<CSeq_annot> GetQualityGraphAnnot(const string& annot_name) const;
689
690 enum EBioseqFlags {
691 fQualityGraph = 1<<0,
692 fDefaultBioseqFlags = 0
693 };
694 typedef int TBioseqFlags;
695 CRef<CBioseq> GetShortBioseq(TBioseqFlags flags = fDefaultBioseqFlags) const;
696
697 CCSraRefSeqIterator GetRefSeqIter(TSeqPos* ref_pos_ptr = NULL) const;
698 CCSraAlignIterator GetAlignIter() const;
699
700 protected:
GetDb(void) const701 CCSraDb_Impl& GetDb(void) const {
702 return m_Db.GetNCObject();
703 }
704
705 void x_Init(const CCSraDb& csra_db, EClipType clip_type);
706 bool x_ValidRead(void) const;
707 bool x_Settle(bool single_spot = false);
x_Next(void)708 bool x_Next(void) {
709 ++m_ReadId;
710 return x_Settle();
711 }
712
713
714 void x_GetMaxReadId(void);
715 CTempString x_GetReadData(TOpenRange range) const;
716
717 CRef<CSeq_annot> x_GetSeq_annot(const string* annot_name) const;
718 CRef<CSeq_annot> x_GetQualityGraphAnnot(const string* annot_name) const;
719 CRef<CSeq_annot> x_GetQualityGraphAnnot(TOpenRange range,
720 const string* annot_name) const;
721 CRef<CSeq_graph> x_GetQualityGraph(TOpenRange range) const;
722
723 private:
724 CCSraDb m_Db; // refseq selector
725 CRef<CCSraDb_Impl::SSeqTableCursor> m_Seq; // VDB sequence table accessor
726
727 TVDBRowId m_SpotId;
728 TVDBRowId m_MaxSpotId;
729 uint32_t m_ReadId;
730 uint32_t m_MaxReadId;
731 bool m_IncludeTechnicalReads;
732 bool m_ClipByQuality;
733
734 rc_t m_Error; // result of VDB access
735 };
736
737
738 /////////////////////////////////////////////////////////////////////////////
739 // CCSraRefSeqIterator
740
741 inline
742 CRef<CSeq_annot>
GetSeq_annot(const string & annot_name) const743 CCSraRefSeqIterator::GetSeq_annot(const string& annot_name) const
744 {
745 return x_GetSeq_annot(&annot_name);
746 }
747
748
749 inline
750 CRef<CSeq_annot>
GetSeq_annot(void) const751 CCSraRefSeqIterator::GetSeq_annot(void) const
752 {
753 return x_GetSeq_annot(0);
754 }
755
756
757 /////////////////////////////////////////////////////////////////////////////
758 // CCSraAlignIterator
759
760 inline
761 CRef<CSeq_entry>
GetMatchEntry(const string & annot_name) const762 CCSraAlignIterator::GetMatchEntry(const string& annot_name) const
763 {
764 return x_GetMatchEntry(&annot_name);
765 }
766
767
768 inline
769 CRef<CSeq_entry>
GetMatchEntry(void) const770 CCSraAlignIterator::GetMatchEntry(void) const
771 {
772 return x_GetMatchEntry(0);
773 }
774
775
776 inline
777 CRef<CSeq_annot>
GetEmptyMatchAnnot(const string & annot_name) const778 CCSraAlignIterator::GetEmptyMatchAnnot(const string& annot_name) const
779 {
780 return x_GetEmptyMatchAnnot(&annot_name);
781 }
782
783
784 inline
785 CRef<CSeq_annot>
GetEmptyMatchAnnot(void) const786 CCSraAlignIterator::GetEmptyMatchAnnot(void) const
787 {
788 return x_GetEmptyMatchAnnot(0);
789 }
790
791
792 inline
793 CRef<CSeq_annot>
GetMatchAnnot(const string & annot_name) const794 CCSraAlignIterator::GetMatchAnnot(const string& annot_name) const
795 {
796 return x_GetMatchAnnot(&annot_name);
797 }
798
799
800 inline
801 CRef<CSeq_annot>
GetMatchAnnot(void) const802 CCSraAlignIterator::GetMatchAnnot(void) const
803 {
804 return x_GetMatchAnnot(0);
805 }
806
807
808 inline
809 CRef<CSeq_annot>
GetQualityGraphAnnot(const string & annot_name) const810 CCSraAlignIterator::GetQualityGraphAnnot(const string& annot_name) const
811 {
812 return x_GetQualityGraphAnnot(&annot_name);
813 }
814
815
816 inline
817 CRef<CSeq_annot>
GetQualityGraphAnnot(void) const818 CCSraAlignIterator::GetQualityGraphAnnot(void) const
819 {
820 return x_GetQualityGraphAnnot(0);
821 }
822
823
824 inline
825 CRef<CSeq_annot>
x_GetSeq_annot(const string * annot_name) const826 CCSraAlignIterator::x_GetSeq_annot(const string* annot_name) const
827 {
828 return m_RefIter.x_GetSeq_annot(annot_name);
829 }
830
831
832 inline
833 CRef<CSeq_annot>
GetSeq_annot(const string & annot_name) const834 CCSraAlignIterator::GetSeq_annot(const string& annot_name) const
835 {
836 return x_GetSeq_annot(&annot_name);
837 }
838
839
840 inline
841 CRef<CSeq_annot>
GetSeq_annot(void) const842 CCSraAlignIterator::GetSeq_annot(void) const
843 {
844 return x_GetSeq_annot(0);
845 }
846
847
848 inline
849 CRef<CSeq_annot>
MakeSeq_annot(const string & annot_name)850 CCSraAlignIterator::MakeSeq_annot(const string& annot_name)
851 {
852 return CCSraRefSeqIterator::MakeSeq_annot(annot_name);
853 }
854
855
856 /////////////////////////////////////////////////////////////////////////////
857 // CCSraShortReadIterator
858
859 inline
860 CRef<CSeq_annot>
GetQualityGraphAnnot(const string & annot_name) const861 CCSraShortReadIterator::GetQualityGraphAnnot(const string& annot_name) const
862 {
863 return x_GetQualityGraphAnnot(&annot_name);
864 }
865
866
867 inline
868 CRef<CSeq_annot>
GetQualityGraphAnnot(void) const869 CCSraShortReadIterator::GetQualityGraphAnnot(void) const
870 {
871 return x_GetQualityGraphAnnot(0);
872 }
873
874
875 END_NAMESPACE(objects);
876 END_NCBI_NAMESPACE;
877
878 #endif // SRA__READER__SRA__CSRAREAD__HPP
879