1 /* $Id: sraread.cpp 519877 2016-11-18 20:51:27Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Eugene Vasilchenko
27 *
28 * File Description:
29 * Access to SRA files
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <sra/readers/sra/sraread.hpp>
35
36 #include <klib/rc.h>
37 #include <klib/writer.h>
38 #include <sra/types.h>
39 #include <sra/impl.h>
40 #include <sra/sradb.h>
41 #include <sra/sradb-priv.h>
42 #include <vfs/manager.h>
43 #include <vfs/resolver.h>
44 #include <vfs/path.h>
45
46 #include <corelib/ncbimtx.hpp>
47 #include <corelib/ncbi_param.hpp>
48 #include <sra/readers/ncbi_traces_path.hpp>
49 #include <objects/general/general__.hpp>
50 #include <objects/seq/seq__.hpp>
51 #include <objects/seqset/seqset__.hpp>
52 #include <objects/seqres/seqres__.hpp>
53 #include <sra/error_codes.hpp>
54
55 #include <cstring>
56
57 BEGIN_NCBI_SCOPE
58
59 #define NCBI_USE_ERRCODE_X SRAReader
60 NCBI_DEFINE_ERR_SUBCODE_X(1);
61
62 BEGIN_SCOPE(objects)
63
64 class CSeq_entry;
65
66
67 DEFINE_SRA_REF_TRAITS(SRAMgr, const);
68 DEFINE_SRA_REF_TRAITS(SRAColumn, const);
69 DEFINE_SRA_REF_TRAITS(SRATable, const);
70
71
CSraException(void)72 CSraException::CSraException(void)
73 : m_RC(0)
74 {
75 }
76
77
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,EDiagSev severity)78 CSraException::CSraException(const CDiagCompileInfo& info,
79 const CException* prev_exc,
80 EErrCode err_code,
81 const string& message,
82 EDiagSev severity)
83 : CException(info, prev_exc, CException::EErrCode(err_code), message),
84 m_RC(0)
85 {
86 this->x_Init(info, message, prev_exc, severity);
87 this->x_InitErrCode(CException::EErrCode(err_code));
88 }
89
90
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,rc_t rc,EDiagSev severity)91 CSraException::CSraException(const CDiagCompileInfo& info,
92 const CException* prev_exc,
93 EErrCode err_code,
94 const string& message,
95 rc_t rc,
96 EDiagSev severity)
97 : CException(info, prev_exc, CException::EErrCode(err_code), message),
98 m_RC(rc)
99 {
100 this->x_Init(info, message, prev_exc, severity);
101 this->x_InitErrCode(CException::EErrCode(err_code));
102 }
103
104
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,rc_t rc,const string & param,EDiagSev severity)105 CSraException::CSraException(const CDiagCompileInfo& info,
106 const CException* prev_exc,
107 EErrCode err_code,
108 const string& message,
109 rc_t rc,
110 const string& param,
111 EDiagSev severity)
112 : CException(info, prev_exc, CException::EErrCode(err_code), message),
113 m_RC(rc),
114 m_Param(param)
115 {
116 this->x_Init(info, message, prev_exc, severity);
117 this->x_InitErrCode(CException::EErrCode(err_code));
118 }
119
120
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,rc_t rc,int64_t param,EDiagSev severity)121 CSraException::CSraException(const CDiagCompileInfo& info,
122 const CException* prev_exc,
123 EErrCode err_code,
124 const string& message,
125 rc_t rc,
126 int64_t param,
127 EDiagSev severity)
128 : CException(info, prev_exc, CException::EErrCode(err_code), message),
129 m_RC(rc),
130 m_Param(NStr::Int8ToString(param))
131 {
132 this->x_Init(info, message, prev_exc, severity);
133 this->x_InitErrCode(CException::EErrCode(err_code));
134 }
135
136
CSraException(const CSraException & other)137 CSraException::CSraException(const CSraException& other)
138 : CException( other),
139 m_RC(other.m_RC),
140 m_Param(other.m_Param)
141 {
142 x_Assign(other);
143 }
144
145
~CSraException(void)146 CSraException::~CSraException(void) throw()
147 {
148 }
149
150
x_Clone(void) const151 const CException* CSraException::x_Clone(void) const
152 {
153 return new CSraException(*this);
154 }
155
156
GetType(void) const157 const char* CSraException::GetType(void) const
158 {
159 return "CSraException";
160 }
161
162
GetErrCode(void) const163 CSraException::TErrCode CSraException::GetErrCode(void) const
164 {
165 return typeid(*this) == typeid(CSraException) ?
166 x_GetErrCode() : CException::GetErrCode();
167 }
168
169
GetErrCodeString(void) const170 const char* CSraException::GetErrCodeString(void) const
171 {
172 switch (GetErrCode()) {
173 case eOtherError: return "eOtherError";
174 case eNullPtr: return "eNullPtr";
175 case eAddRefFailed: return "eAddRefFailed";
176 case eInvalidArg: return "eInvalidArg";
177 case eInitFailed: return "eInitFailed";
178 case eNotFound: return "eNotFound";
179 case eInvalidState: return "eInvalidState";
180 case eInvalidIndex: return "eInvalidIndex";
181 case eNotFoundDb: return "eNotFoundDb";
182 case eNotFoundTable: return "eNotFoundTable";
183 case eNotFoundColumn: return "eNotFoundColumn";
184 case eNotFoundValue: return "eNotFoundValue";
185 case eDataError: return "eDataError";
186 case eNotFoundIndex: return "eNotFoundIndex";
187 case eProtectedDb: return "eProtectedDb";
188 default: return CException::GetErrCodeString();
189 }
190 }
191
192
operator <<(ostream & out,const CSraRcFormatter & rc)193 ostream& operator<<(ostream& out, const CSraRcFormatter& rc)
194 {
195 char buffer[1024];
196 size_t error_len;
197 RCExplain(rc.GetRC(), buffer, sizeof(buffer), &error_len);
198 out << "0x" << hex << rc.GetRC() << dec << ": " << buffer;
199 return out;
200 }
201
202
ReportExtra(ostream & out) const203 void CSraException::ReportExtra(ostream& out) const
204 {
205 if ( m_RC ) {
206 out << CSraRcFormatter(m_RC);
207 }
208 if ( !m_Param.empty() ) {
209 if ( m_RC ) {
210 out << ": ";
211 }
212 out << m_Param;
213 }
214 }
215
216
ReportError(const char * msg,rc_t rc)217 void CSraException::ReportError(const char* msg, rc_t rc)
218 {
219 ERR_POST_X(1, msg<<": "<<CSraRcFormatter(rc));
220 }
221
222
CSraPath(void)223 CSraPath::CSraPath(void)
224 {
225 }
226
227
CSraPath(const string &,const string &)228 CSraPath::CSraPath(const string& /*rep_path*/, const string& /*vol_path*/)
229 {
230 }
231
232
233 NCBI_PARAM_DECL(string, SRA, REP_PATH);
234 NCBI_PARAM_DEF(string, SRA, REP_PATH, "");
235
236
237 NCBI_PARAM_DECL(string, SRA, VOL_PATH);
238 NCBI_PARAM_DEF(string, SRA, VOL_PATH, "");
239
240
GetDefaultRepPath(void)241 string CSraPath::GetDefaultRepPath(void)
242 {
243 return NCBI_PARAM_TYPE(SRA, REP_PATH)::GetDefault();
244 }
245
246
GetDefaultVolPath(void)247 string CSraPath::GetDefaultVolPath(void)
248 {
249 return NCBI_PARAM_TYPE(SRA, VOL_PATH)::GetDefault();
250 }
251
252
253 DEFINE_STATIC_FAST_MUTEX(sx_PathMutex);
254
255
AddRepPath(const string & rep_path)256 void CSraPath::AddRepPath(const string& rep_path)
257 {
258 }
259
260
AddVolPath(const string & vol_path)261 void CSraPath::AddVolPath(const string& vol_path)
262 {
263 }
264
265
FindAccPath(const string & acc) const266 string CSraPath::FindAccPath(const string& acc) const
267 {
268 string ret;
269 ret.resize(128);
270 rc_t rc;
271 {{
272 CFastMutexGuard guard(sx_PathMutex);
273 //rc = SRAPathFind(*this, acc.c_str(), &ret[0], ret.size());
274 VFSManager* mgr;
275 rc = VFSManagerMake(&mgr);
276 if (rc == 0) {
277 rc_t rc2;
278 VResolver* res;
279 rc = VFSManagerGetResolver(mgr, &res);
280 if (rc == 0) {
281 VPath* accPath;
282 rc = VFSManagerMakePath(mgr, &accPath, acc.c_str());
283 if (rc == 0)
284 {
285 const VPath* resolvedPath;
286 rc = VResolverQuery(res, eProtocolHttp, accPath, &resolvedPath, NULL, NULL);
287 if (rc == 0)
288 {
289 String s;
290 VPathGetPath(resolvedPath, &s);
291 ret.assign(s.addr, s.len);
292 rc2 = VPathRelease(resolvedPath);
293 if (rc == 0)
294 rc = rc2;
295 }
296 rc2 = VPathRelease(accPath);
297 if (rc == 0)
298 rc = rc2;
299 }
300 rc2 = VResolverRelease(res);
301 if (rc == 0)
302 rc = rc2;
303 }
304 rc2 = VFSManagerRelease(mgr);
305 if (rc == 0)
306 rc = rc2;
307 }
308 }}
309 if ( rc ) {
310 NCBI_THROW3(CSraException, eNotFound,
311 "Cannot find acc path", rc, acc);
312 }
313 SIZE_TYPE eol_pos = ret.find('\0');
314 if ( eol_pos != NPOS ) {
315 ret.resize(eol_pos);
316 }
317 return ret;
318 }
319
320
CSraMgr(ETrim trim)321 CSraMgr::CSraMgr(ETrim trim)
322 : m_Path(null),
323 m_Trim(trim == eTrim)
324 {
325 x_Init();
326 }
327
328
CSraMgr(const string &,const string &,ETrim trim)329 CSraMgr::CSraMgr(const string& /*rep_path*/, const string& /*vol_path*/,
330 ETrim trim)
331 : m_Path(null),
332 m_Trim(trim == eTrim)
333 {
334 x_Init();
335 }
336
337
RegisterFunctions(void)338 void CSraMgr::RegisterFunctions(void)
339 {
340 }
341
342
x_Init(void)343 void CSraMgr::x_Init(void)
344 {
345 if ( rc_t rc = SRAMgrMakeRead(x_InitPtr()) ) {
346 *x_InitPtr() = 0;
347 NCBI_THROW2(CSraException, eInitFailed,
348 "Cannot open SRAMgr", rc);
349 }
350 }
351
352
GetPath(void) const353 CSraPath& CSraMgr::GetPath(void) const
354 {
355 if ( !m_Path ) {
356 CFastMutexGuard guard(sx_PathMutex);
357 if ( !m_Path ) {
358 m_Path = CSraPath();
359 }
360 }
361 return m_Path;
362 }
363
364
FindAccPath(const string & acc) const365 string CSraMgr::FindAccPath(const string& acc) const
366 {
367 return GetPath().FindAccPath(acc);
368 }
369
370
GetSpotInfo(const string & sra,CSraRun & run)371 spotid_t CSraMgr::GetSpotInfo(const string& sra, CSraRun& run)
372 {
373 SIZE_TYPE dot = sra.find('.');
374 string acc;
375 spotid_t spot_id = 0;
376 if ( dot == NPOS ) {
377 NCBI_THROW3(CSraException, eInvalidArg,
378 "No spot id specified",
379 RC(rcApp, rcFunctParam, rcDecoding, rcParam, rcIncomplete),
380 sra);
381 }
382 else {
383 acc = sra.substr(0, dot);
384 spot_id = NStr::StringToUInt(sra.substr(dot+1));
385 }
386 if ( !run || run.GetAccession() != acc ) {
387 run = CSraRun(*this, acc);
388 }
389 return spot_id;
390 }
391
392
GetSpotEntry(const string & sra,CSraRun & run)393 CRef<CSeq_entry> CSraMgr::GetSpotEntry(const string& sra,
394 CSraRun& run)
395 {
396 return run.GetSpotEntry(GetSpotInfo(sra, run));
397 }
398
399
GetSpotEntry(const string & sra)400 CRef<CSeq_entry> CSraMgr::GetSpotEntry(const string& sra)
401 {
402 CSraRun run;
403 return GetSpotEntry(sra, run);
404 }
405
406
CSraValue(const CSraColumn & col,spotid_t id,ECheckRc check_rc)407 CSraValue::CSraValue(const CSraColumn& col, spotid_t id, ECheckRc check_rc)
408 : m_Error(0), m_Data(0), m_Bitoffset(0), m_Bitlen(0), m_Len(0)
409 {
410 m_Error = SRAColumnRead(col, id, &m_Data, &m_Bitoffset, &m_Bitlen);
411 if ( !m_Error ) {
412 if ( m_Bitoffset ) {
413 m_Error = RC(rcApp, rcColumn, rcDecoding, rcOffset, rcUnsupported);
414 }
415 else {
416 uint64_t len = (m_Bitlen+7)>>3;
417 #if SIZEOF_SIZE_T == 4
418 m_Len = size_t(len);
419 if ( m_Len != len ) {
420 m_Error = RC(rcApp, rcColumn, rcDecoding, rcSize, rcUnsupported);
421 }
422 #else
423 m_Len = len;
424 #endif
425 }
426 }
427 if ( m_Error && check_rc == eCheckRc ) {
428 NCBI_THROW3(CSraException, eNotFoundValue, "Cannot read value",
429 m_Error, NStr::NumericToString(id));
430 }
431 }
432
433
Init(CSraMgr & mgr,const string & acc)434 void CSraRun::Init(CSraMgr& mgr, const string& acc)
435 {
436 m_Acc = acc;
437 m_Trim = mgr.GetTrim();
438 x_DoInit(mgr, acc);
439 }
440
441
x_DoInit(CSraMgr & mgr,const string & acc)442 void CSraRun::x_DoInit(CSraMgr& mgr, const string& acc)
443 {
444 if ( rc_t rc = SRAMgrOpenTableRead(mgr, x_InitPtr(), "%.*s",
445 int(acc.size()), acc.data()) ) {
446 *x_InitPtr() = 0;
447 NCBI_THROW3(CSraException, eNotFoundDb,
448 "Cannot open run read", rc, acc);
449 }
450 m_Name.Init(*this, "NAME", vdb_ascii_t);
451 m_Read.Init(*this, "READ", insdc_fasta_t);
452 m_Qual.Init(*this, "QUALITY", ncbi_qual1_t);
453 m_SDesc.Init(*this, "SPOT_DESC", sra_spot_desc_t);
454 m_RDesc.Init(*this, "READ_DESC", sra_read_desc_t);
455 m_TrimStart.TryInitRc(*this, "TRIM_START", "INSDC:coord:zero");
456 }
457
458
TryInitRc(const CSraRun & run,const char * name,const char * type)459 rc_t CSraColumn::TryInitRc(const CSraRun& run,
460 const char* name, const char* type)
461 {
462 return SRATableOpenColumnRead(run, x_InitPtr(), name, type);
463 }
464
465
Init(const CSraRun & run,const char * name,const char * type)466 void CSraColumn::Init(const CSraRun& run,
467 const char* name, const char* type)
468 {
469 if ( rc_t rc = TryInitRc(run, name, type) ) {
470 *x_InitPtr() = 0;
471 NCBI_THROW3(CSraException, eInitFailed,
472 "Cannot get SRA column", rc, name);
473 }
474 }
475
476
GetSpotEntry(spotid_t spot_id) const477 CRef<CSeq_entry> CSraRun::GetSpotEntry(spotid_t spot_id) const
478 {
479 CRef<CSeq_entry> entry;
480
481 CSraStringValue name(m_Name, spot_id);
482
483 entry = new CSeq_entry();
484 CBioseq_set& seqset = entry->SetSet();
485 seqset.SetLevel(0);
486 seqset.SetClass(seqset.eClass_other);
487
488 CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);
489 TSeqPos trim_start = m_Trim && m_TrimStart?
490 CSraValueFor<INSDC_coord_zero>(m_TrimStart, spot_id).Value(): 0;
491 TSeqPos trim_end = sdesc->clip_qual_right;
492
493 CSraValueFor<SRAReadDesc> rdesc(m_RDesc, spot_id);
494 CSraStringValue read(m_Read, spot_id);
495 CSraBytesValue qual(m_Qual, spot_id);
496 int seq_count = 0;
497 string id_start = GetAccession()+'.'+NStr::NumericToString(spot_id)+'.';
498 for ( int r = 0; r < sdesc->num_reads; ++r ) {
499 if ( rdesc[r].type != SRA_READ_TYPE_BIOLOGICAL ) {
500 continue;
501 }
502 TSeqPos len = rdesc[r].seg.len;
503 if ( len == 0 ) {
504 continue;
505 }
506 TSeqPos start = rdesc[r].seg.start;
507 TSeqPos end = start + len;
508 if ( m_Trim ) {
509 start = max(start, trim_start);
510 end = min(end, trim_end);
511 if ( start >= end ) {
512 continue;
513 }
514 len = end - start;
515 }
516
517 CRef<CSeq_entry> seq_entry(new CSeq_entry);
518 CBioseq& seq = seq_entry->SetSeq();
519
520 CRef<CSeq_id> id(new CSeq_id);
521 id->SetGeneral().SetDb("SRA");
522 id->SetGeneral().SetTag().SetStr(id_start+NStr::UIntToString(r+1));
523 seq.SetId().push_back(id);
524
525 {{
526 CRef<CSeqdesc> desc(new CSeqdesc);
527 desc->SetTitle(name.Value());
528 seq.SetDescr().Set().push_back(desc);
529 }}
530 {{
531 CSeq_inst& inst = seq.SetInst();
532 inst.SetRepr(inst.eRepr_raw);
533 inst.SetMol(inst.eMol_na);
534 inst.SetLength(len);
535 inst.SetSeq_data().SetIupacna().Set()
536 .assign(read.data()+start, len);
537 }}
538 {{
539 CRef<CSeq_annot> annot(new CSeq_annot);
540 CRef<CSeq_graph> graph(new CSeq_graph);
541 annot->SetData().SetGraph().push_back(graph);
542 graph->SetTitle("Phred Quality");
543 graph->SetLoc().SetWhole(*id);
544 graph->SetNumval(len);
545 CByte_graph& bytes = graph->SetGraph().SetByte();
546 bytes.SetAxis(0);
547 CByte_graph::TValues& values = bytes.SetValues();
548 values.reserve(len);
549 int min = kMax_Int;
550 int max = kMin_Int;
551 for ( size_t i = 0; i < len; ++i ) {
552 int v = qual[start+i];
553 values.push_back(v);
554 if ( v < min ) {
555 min = v;
556 }
557 if ( v > max ) {
558 max = v;
559 }
560 }
561 bytes.SetMin(min);
562 bytes.SetMax(max);
563
564 seq.SetAnnot().push_back(annot);
565 }}
566
567 seqset.SetSeq_set().push_back(seq_entry);
568 ++seq_count;
569 }
570 switch ( seq_count ) {
571 case 0:
572 entry.Reset();
573 break;
574 case 1:
575 entry = seqset.GetSeq_set().front();
576 break;
577 }
578 return entry;
579 }
580
581
GetSequenceType(spotid_t spot_id,uint8_t read_id) const582 CSeq_inst::TMol CSraRun::GetSequenceType(spotid_t spot_id,
583 uint8_t read_id) const
584 {
585 CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);
586 if ( read_id == 0 || read_id > sdesc->num_reads ) {
587 return CSeq_inst::eMol_not_set;
588 }
589 return CSeq_inst::eMol_na;
590 }
591
592
GetSequenceLength(spotid_t spot_id,uint8_t read_id) const593 TSeqPos CSraRun::GetSequenceLength(spotid_t spot_id,
594 uint8_t read_id) const
595 {
596 CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);
597 if ( read_id == 0 || read_id > sdesc->num_reads ) {
598 return kInvalidSeqPos;
599 }
600 TSeqPos trim_start = m_Trim && m_TrimStart?
601 CSraValueFor<INSDC_coord_zero>(m_TrimStart, spot_id).Value(): 0;
602 TSeqPos trim_end = sdesc->clip_qual_right;
603 CSraValueFor<SRAReadDesc> rdesc(m_RDesc, spot_id);
604 size_t r = read_id-1;
605 TSeqPos len = rdesc[r].seg.len;
606 if ( len == 0 ) {
607 return kInvalidSeqPos;
608 }
609 TSeqPos start = rdesc[r].seg.start;
610 TSeqPos end = start + len;
611 if ( m_Trim ) {
612 start = max(start, trim_start);
613 end = min(end, trim_end);
614 if ( start >= end ) {
615 return kInvalidSeqPos;
616 }
617 len = end - start;
618 }
619 return len;
620 }
621
622
623 END_SCOPE(objects)
624 END_NCBI_SCOPE
625