1 /*  $Id: sraread.cpp 519877 2016-11-18 20:51:27Z vasilche $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Eugene Vasilchenko
27  *
28  * File Description:
29  *   Access to SRA files
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <sra/readers/sra/sraread.hpp>
35 
36 #include <klib/rc.h>
37 #include <klib/writer.h>
38 #include <sra/types.h>
39 #include <sra/impl.h>
40 #include <sra/sradb.h>
41 #include <sra/sradb-priv.h>
42 #include <vfs/manager.h>
43 #include <vfs/resolver.h>
44 #include <vfs/path.h>
45 
46 #include <corelib/ncbimtx.hpp>
47 #include <corelib/ncbi_param.hpp>
48 #include <sra/readers/ncbi_traces_path.hpp>
49 #include <objects/general/general__.hpp>
50 #include <objects/seq/seq__.hpp>
51 #include <objects/seqset/seqset__.hpp>
52 #include <objects/seqres/seqres__.hpp>
53 #include <sra/error_codes.hpp>
54 
55 #include <cstring>
56 
57 BEGIN_NCBI_SCOPE
58 
59 #define NCBI_USE_ERRCODE_X   SRAReader
60 NCBI_DEFINE_ERR_SUBCODE_X(1);
61 
62 BEGIN_SCOPE(objects)
63 
64 class CSeq_entry;
65 
66 
67 DEFINE_SRA_REF_TRAITS(SRAMgr, const);
68 DEFINE_SRA_REF_TRAITS(SRAColumn, const);
69 DEFINE_SRA_REF_TRAITS(SRATable, const);
70 
71 
CSraException(void)72 CSraException::CSraException(void)
73     : m_RC(0)
74 {
75 }
76 
77 
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,EDiagSev severity)78 CSraException::CSraException(const CDiagCompileInfo& info,
79                              const CException* prev_exc,
80                              EErrCode err_code,
81                              const string& message,
82                              EDiagSev severity)
83     : CException(info, prev_exc, CException::EErrCode(err_code), message),
84       m_RC(0)
85 {
86     this->x_Init(info, message, prev_exc, severity);
87     this->x_InitErrCode(CException::EErrCode(err_code));
88 }
89 
90 
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,rc_t rc,EDiagSev severity)91 CSraException::CSraException(const CDiagCompileInfo& info,
92                              const CException* prev_exc,
93                              EErrCode err_code,
94                              const string& message,
95                              rc_t rc,
96                              EDiagSev severity)
97     : CException(info, prev_exc, CException::EErrCode(err_code), message),
98       m_RC(rc)
99 {
100     this->x_Init(info, message, prev_exc, severity);
101     this->x_InitErrCode(CException::EErrCode(err_code));
102 }
103 
104 
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,rc_t rc,const string & param,EDiagSev severity)105 CSraException::CSraException(const CDiagCompileInfo& info,
106                              const CException* prev_exc,
107                              EErrCode err_code,
108                              const string& message,
109                              rc_t rc,
110                              const string& param,
111                              EDiagSev severity)
112     : CException(info, prev_exc, CException::EErrCode(err_code), message),
113       m_RC(rc),
114       m_Param(param)
115 {
116     this->x_Init(info, message, prev_exc, severity);
117     this->x_InitErrCode(CException::EErrCode(err_code));
118 }
119 
120 
CSraException(const CDiagCompileInfo & info,const CException * prev_exc,EErrCode err_code,const string & message,rc_t rc,int64_t param,EDiagSev severity)121 CSraException::CSraException(const CDiagCompileInfo& info,
122                              const CException* prev_exc,
123                              EErrCode err_code,
124                              const string& message,
125                              rc_t rc,
126                              int64_t param,
127                              EDiagSev severity)
128     : CException(info, prev_exc, CException::EErrCode(err_code), message),
129       m_RC(rc),
130       m_Param(NStr::Int8ToString(param))
131 {
132     this->x_Init(info, message, prev_exc, severity);
133     this->x_InitErrCode(CException::EErrCode(err_code));
134 }
135 
136 
CSraException(const CSraException & other)137 CSraException::CSraException(const CSraException& other)
138     : CException( other),
139       m_RC(other.m_RC),
140       m_Param(other.m_Param)
141 {
142     x_Assign(other);
143 }
144 
145 
~CSraException(void)146 CSraException::~CSraException(void) throw()
147 {
148 }
149 
150 
x_Clone(void) const151 const CException* CSraException::x_Clone(void) const
152 {
153     return new CSraException(*this);
154 }
155 
156 
GetType(void) const157 const char* CSraException::GetType(void) const
158 {
159     return "CSraException";
160 }
161 
162 
GetErrCode(void) const163 CSraException::TErrCode CSraException::GetErrCode(void) const
164 {
165     return typeid(*this) == typeid(CSraException) ?
166         x_GetErrCode() : CException::GetErrCode();
167 }
168 
169 
GetErrCodeString(void) const170 const char* CSraException::GetErrCodeString(void) const
171 {
172     switch (GetErrCode()) {
173     case eOtherError:   return "eOtherError";
174     case eNullPtr:      return "eNullPtr";
175     case eAddRefFailed: return "eAddRefFailed";
176     case eInvalidArg:   return "eInvalidArg";
177     case eInitFailed:   return "eInitFailed";
178     case eNotFound:     return "eNotFound";
179     case eInvalidState: return "eInvalidState";
180     case eInvalidIndex: return "eInvalidIndex";
181     case eNotFoundDb:   return "eNotFoundDb";
182     case eNotFoundTable: return "eNotFoundTable";
183     case eNotFoundColumn: return "eNotFoundColumn";
184     case eNotFoundValue: return "eNotFoundValue";
185     case eDataError:    return "eDataError";
186     case eNotFoundIndex: return "eNotFoundIndex";
187     case eProtectedDb:  return "eProtectedDb";
188     default:            return CException::GetErrCodeString();
189     }
190 }
191 
192 
operator <<(ostream & out,const CSraRcFormatter & rc)193 ostream& operator<<(ostream& out, const CSraRcFormatter& rc)
194 {
195     char buffer[1024];
196     size_t error_len;
197     RCExplain(rc.GetRC(), buffer, sizeof(buffer), &error_len);
198     out << "0x" << hex << rc.GetRC() << dec << ": " << buffer;
199     return out;
200 }
201 
202 
ReportExtra(ostream & out) const203 void CSraException::ReportExtra(ostream& out) const
204 {
205     if ( m_RC ) {
206         out << CSraRcFormatter(m_RC);
207     }
208     if ( !m_Param.empty() ) {
209         if ( m_RC ) {
210             out << ": ";
211         }
212         out << m_Param;
213     }
214 }
215 
216 
ReportError(const char * msg,rc_t rc)217 void CSraException::ReportError(const char* msg, rc_t rc)
218 {
219     ERR_POST_X(1, msg<<": "<<CSraRcFormatter(rc));
220 }
221 
222 
CSraPath(void)223 CSraPath::CSraPath(void)
224 {
225 }
226 
227 
CSraPath(const string &,const string &)228 CSraPath::CSraPath(const string& /*rep_path*/, const string& /*vol_path*/)
229 {
230 }
231 
232 
233 NCBI_PARAM_DECL(string, SRA, REP_PATH);
234 NCBI_PARAM_DEF(string, SRA, REP_PATH, "");
235 
236 
237 NCBI_PARAM_DECL(string, SRA, VOL_PATH);
238 NCBI_PARAM_DEF(string, SRA, VOL_PATH, "");
239 
240 
GetDefaultRepPath(void)241 string CSraPath::GetDefaultRepPath(void)
242 {
243     return NCBI_PARAM_TYPE(SRA, REP_PATH)::GetDefault();
244 }
245 
246 
GetDefaultVolPath(void)247 string CSraPath::GetDefaultVolPath(void)
248 {
249     return NCBI_PARAM_TYPE(SRA, VOL_PATH)::GetDefault();
250 }
251 
252 
253 DEFINE_STATIC_FAST_MUTEX(sx_PathMutex);
254 
255 
AddRepPath(const string & rep_path)256 void CSraPath::AddRepPath(const string& rep_path)
257 {
258 }
259 
260 
AddVolPath(const string & vol_path)261 void CSraPath::AddVolPath(const string& vol_path)
262 {
263 }
264 
265 
FindAccPath(const string & acc) const266 string CSraPath::FindAccPath(const string& acc) const
267 {
268     string ret;
269     ret.resize(128);
270     rc_t rc;
271     {{
272         CFastMutexGuard guard(sx_PathMutex);
273         //rc = SRAPathFind(*this, acc.c_str(), &ret[0], ret.size());
274         VFSManager* mgr;
275         rc = VFSManagerMake(&mgr);
276         if (rc == 0) {
277             rc_t rc2;
278             VResolver* res;
279             rc = VFSManagerGetResolver(mgr, &res);
280             if (rc == 0) {
281                 VPath* accPath;
282                 rc = VFSManagerMakePath(mgr, &accPath, acc.c_str());
283                 if (rc == 0)
284                 {
285                     const VPath* resolvedPath;
286                     rc = VResolverQuery(res, eProtocolHttp, accPath, &resolvedPath, NULL, NULL);
287                     if (rc == 0)
288                     {
289                         String s;
290                         VPathGetPath(resolvedPath, &s);
291                         ret.assign(s.addr, s.len);
292                         rc2 = VPathRelease(resolvedPath);
293                         if (rc == 0)
294                             rc = rc2;
295                     }
296                     rc2 = VPathRelease(accPath);
297                     if (rc == 0)
298                         rc = rc2;
299                 }
300                 rc2 = VResolverRelease(res);
301                 if (rc == 0)
302                     rc = rc2;
303             }
304             rc2 = VFSManagerRelease(mgr);
305             if (rc == 0)
306                 rc = rc2;
307         }
308     }}
309     if ( rc ) {
310         NCBI_THROW3(CSraException, eNotFound,
311                     "Cannot find acc path", rc, acc);
312     }
313     SIZE_TYPE eol_pos = ret.find('\0');
314     if ( eol_pos != NPOS ) {
315         ret.resize(eol_pos);
316     }
317     return ret;
318 }
319 
320 
CSraMgr(ETrim trim)321 CSraMgr::CSraMgr(ETrim trim)
322     : m_Path(null),
323       m_Trim(trim == eTrim)
324 {
325     x_Init();
326 }
327 
328 
CSraMgr(const string &,const string &,ETrim trim)329 CSraMgr::CSraMgr(const string& /*rep_path*/, const string& /*vol_path*/,
330                  ETrim trim)
331     : m_Path(null),
332       m_Trim(trim == eTrim)
333 {
334     x_Init();
335 }
336 
337 
RegisterFunctions(void)338 void CSraMgr::RegisterFunctions(void)
339 {
340 }
341 
342 
x_Init(void)343 void CSraMgr::x_Init(void)
344 {
345     if ( rc_t rc = SRAMgrMakeRead(x_InitPtr()) ) {
346         *x_InitPtr() = 0;
347         NCBI_THROW2(CSraException, eInitFailed,
348                     "Cannot open SRAMgr", rc);
349     }
350 }
351 
352 
GetPath(void) const353 CSraPath& CSraMgr::GetPath(void) const
354 {
355     if ( !m_Path ) {
356         CFastMutexGuard guard(sx_PathMutex);
357         if ( !m_Path ) {
358             m_Path = CSraPath();
359         }
360     }
361     return m_Path;
362 }
363 
364 
FindAccPath(const string & acc) const365 string CSraMgr::FindAccPath(const string& acc) const
366 {
367     return GetPath().FindAccPath(acc);
368 }
369 
370 
GetSpotInfo(const string & sra,CSraRun & run)371 spotid_t CSraMgr::GetSpotInfo(const string& sra, CSraRun& run)
372 {
373     SIZE_TYPE dot = sra.find('.');
374     string acc;
375     spotid_t spot_id = 0;
376     if ( dot == NPOS ) {
377         NCBI_THROW3(CSraException, eInvalidArg,
378                     "No spot id specified",
379                     RC(rcApp, rcFunctParam, rcDecoding, rcParam, rcIncomplete),
380                     sra);
381     }
382     else {
383         acc = sra.substr(0, dot);
384         spot_id = NStr::StringToUInt(sra.substr(dot+1));
385     }
386     if ( !run || run.GetAccession() != acc ) {
387         run = CSraRun(*this, acc);
388     }
389     return spot_id;
390 }
391 
392 
GetSpotEntry(const string & sra,CSraRun & run)393 CRef<CSeq_entry> CSraMgr::GetSpotEntry(const string& sra,
394                                        CSraRun& run)
395 {
396     return run.GetSpotEntry(GetSpotInfo(sra, run));
397 }
398 
399 
GetSpotEntry(const string & sra)400 CRef<CSeq_entry> CSraMgr::GetSpotEntry(const string& sra)
401 {
402     CSraRun run;
403     return GetSpotEntry(sra, run);
404 }
405 
406 
CSraValue(const CSraColumn & col,spotid_t id,ECheckRc check_rc)407 CSraValue::CSraValue(const CSraColumn& col, spotid_t id, ECheckRc check_rc)
408     : m_Error(0), m_Data(0), m_Bitoffset(0), m_Bitlen(0), m_Len(0)
409 {
410     m_Error = SRAColumnRead(col, id, &m_Data, &m_Bitoffset, &m_Bitlen);
411     if ( !m_Error ) {
412         if ( m_Bitoffset ) {
413             m_Error = RC(rcApp, rcColumn, rcDecoding, rcOffset, rcUnsupported);
414         }
415         else {
416             uint64_t len = (m_Bitlen+7)>>3;
417 #if SIZEOF_SIZE_T == 4
418             m_Len = size_t(len);
419             if ( m_Len != len ) {
420                 m_Error = RC(rcApp, rcColumn, rcDecoding, rcSize, rcUnsupported);
421             }
422 #else
423             m_Len = len;
424 #endif
425         }
426     }
427     if ( m_Error && check_rc == eCheckRc ) {
428         NCBI_THROW3(CSraException, eNotFoundValue, "Cannot read value",
429                     m_Error, NStr::NumericToString(id));
430     }
431 }
432 
433 
Init(CSraMgr & mgr,const string & acc)434 void CSraRun::Init(CSraMgr& mgr, const string& acc)
435 {
436     m_Acc = acc;
437     m_Trim = mgr.GetTrim();
438     x_DoInit(mgr, acc);
439 }
440 
441 
x_DoInit(CSraMgr & mgr,const string & acc)442 void CSraRun::x_DoInit(CSraMgr& mgr, const string& acc)
443 {
444     if ( rc_t rc = SRAMgrOpenTableRead(mgr, x_InitPtr(), "%.*s",
445                                        int(acc.size()), acc.data()) ) {
446         *x_InitPtr() = 0;
447         NCBI_THROW3(CSraException, eNotFoundDb,
448                     "Cannot open run read", rc, acc);
449     }
450     m_Name.Init(*this, "NAME", vdb_ascii_t);
451     m_Read.Init(*this, "READ", insdc_fasta_t);
452     m_Qual.Init(*this, "QUALITY", ncbi_qual1_t);
453     m_SDesc.Init(*this, "SPOT_DESC", sra_spot_desc_t);
454     m_RDesc.Init(*this, "READ_DESC", sra_read_desc_t);
455     m_TrimStart.TryInitRc(*this, "TRIM_START", "INSDC:coord:zero");
456 }
457 
458 
TryInitRc(const CSraRun & run,const char * name,const char * type)459 rc_t CSraColumn::TryInitRc(const CSraRun& run,
460                            const char* name, const char* type)
461 {
462     return SRATableOpenColumnRead(run, x_InitPtr(), name, type);
463 }
464 
465 
Init(const CSraRun & run,const char * name,const char * type)466 void CSraColumn::Init(const CSraRun& run,
467                       const char* name, const char* type)
468 {
469     if ( rc_t rc = TryInitRc(run, name, type) ) {
470         *x_InitPtr() = 0;
471         NCBI_THROW3(CSraException, eInitFailed,
472                     "Cannot get SRA column", rc, name);
473     }
474 }
475 
476 
GetSpotEntry(spotid_t spot_id) const477 CRef<CSeq_entry> CSraRun::GetSpotEntry(spotid_t spot_id) const
478 {
479     CRef<CSeq_entry> entry;
480 
481     CSraStringValue name(m_Name, spot_id);
482 
483     entry = new CSeq_entry();
484     CBioseq_set& seqset = entry->SetSet();
485     seqset.SetLevel(0);
486     seqset.SetClass(seqset.eClass_other);
487 
488     CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);
489     TSeqPos trim_start = m_Trim && m_TrimStart?
490         CSraValueFor<INSDC_coord_zero>(m_TrimStart, spot_id).Value(): 0;
491     TSeqPos trim_end = sdesc->clip_qual_right;
492 
493     CSraValueFor<SRAReadDesc> rdesc(m_RDesc, spot_id);
494     CSraStringValue read(m_Read, spot_id);
495     CSraBytesValue qual(m_Qual, spot_id);
496     int seq_count = 0;
497     string id_start = GetAccession()+'.'+NStr::NumericToString(spot_id)+'.';
498     for ( int r = 0; r < sdesc->num_reads; ++r ) {
499         if ( rdesc[r].type != SRA_READ_TYPE_BIOLOGICAL ) {
500             continue;
501         }
502         TSeqPos len = rdesc[r].seg.len;
503         if ( len == 0 ) {
504             continue;
505         }
506         TSeqPos start = rdesc[r].seg.start;
507         TSeqPos end = start + len;
508         if ( m_Trim ) {
509             start = max(start, trim_start);
510             end = min(end, trim_end);
511             if ( start >= end ) {
512                 continue;
513             }
514             len = end - start;
515         }
516 
517         CRef<CSeq_entry> seq_entry(new CSeq_entry);
518         CBioseq& seq = seq_entry->SetSeq();
519 
520         CRef<CSeq_id> id(new CSeq_id);
521         id->SetGeneral().SetDb("SRA");
522         id->SetGeneral().SetTag().SetStr(id_start+NStr::UIntToString(r+1));
523         seq.SetId().push_back(id);
524 
525         {{
526             CRef<CSeqdesc> desc(new CSeqdesc);
527             desc->SetTitle(name.Value());
528             seq.SetDescr().Set().push_back(desc);
529         }}
530         {{
531             CSeq_inst& inst = seq.SetInst();
532             inst.SetRepr(inst.eRepr_raw);
533             inst.SetMol(inst.eMol_na);
534             inst.SetLength(len);
535             inst.SetSeq_data().SetIupacna().Set()
536                 .assign(read.data()+start, len);
537         }}
538         {{
539             CRef<CSeq_annot> annot(new CSeq_annot);
540             CRef<CSeq_graph> graph(new CSeq_graph);
541             annot->SetData().SetGraph().push_back(graph);
542             graph->SetTitle("Phred Quality");
543             graph->SetLoc().SetWhole(*id);
544             graph->SetNumval(len);
545             CByte_graph& bytes = graph->SetGraph().SetByte();
546             bytes.SetAxis(0);
547             CByte_graph::TValues& values = bytes.SetValues();
548             values.reserve(len);
549             int min = kMax_Int;
550             int max = kMin_Int;
551             for ( size_t i = 0; i < len; ++i ) {
552                 int v = qual[start+i];
553                 values.push_back(v);
554                 if ( v < min ) {
555                     min = v;
556                 }
557                 if ( v > max ) {
558                     max = v;
559                 }
560             }
561             bytes.SetMin(min);
562             bytes.SetMax(max);
563 
564             seq.SetAnnot().push_back(annot);
565         }}
566 
567         seqset.SetSeq_set().push_back(seq_entry);
568         ++seq_count;
569     }
570     switch ( seq_count ) {
571     case 0:
572         entry.Reset();
573         break;
574     case 1:
575         entry = seqset.GetSeq_set().front();
576         break;
577     }
578     return entry;
579 }
580 
581 
GetSequenceType(spotid_t spot_id,uint8_t read_id) const582 CSeq_inst::TMol CSraRun::GetSequenceType(spotid_t spot_id,
583                                          uint8_t read_id) const
584 {
585     CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);
586     if ( read_id == 0 || read_id > sdesc->num_reads ) {
587         return CSeq_inst::eMol_not_set;
588     }
589     return CSeq_inst::eMol_na;
590 }
591 
592 
GetSequenceLength(spotid_t spot_id,uint8_t read_id) const593 TSeqPos CSraRun::GetSequenceLength(spotid_t spot_id,
594                                    uint8_t read_id) const
595 {
596     CSraValueFor<SRASpotDesc> sdesc(m_SDesc, spot_id);
597     if ( read_id == 0 || read_id > sdesc->num_reads ) {
598         return kInvalidSeqPos;
599     }
600     TSeqPos trim_start = m_Trim && m_TrimStart?
601         CSraValueFor<INSDC_coord_zero>(m_TrimStart, spot_id).Value(): 0;
602     TSeqPos trim_end = sdesc->clip_qual_right;
603     CSraValueFor<SRAReadDesc> rdesc(m_RDesc, spot_id);
604     size_t r = read_id-1;
605     TSeqPos len = rdesc[r].seg.len;
606     if ( len == 0 ) {
607         return kInvalidSeqPos;
608     }
609     TSeqPos start = rdesc[r].seg.start;
610     TSeqPos end = start + len;
611     if ( m_Trim ) {
612         start = max(start, trim_start);
613         end = min(end, trim_end);
614         if ( start >= end ) {
615             return kInvalidSeqPos;
616         }
617         len = end - start;
618     }
619     return len;
620 }
621 
622 
623 END_SCOPE(objects)
624 END_NCBI_SCOPE
625