1 /* $Id: splign_app.cpp 592172 2019-08-27 19:18:46Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Yuri Kapustin
27 *
28 * File Description: Splign application
29 *
30 */
31
32
33 #include <ncbi_pch.hpp>
34
35 #include "splign_app.hpp"
36 #include "splign_app_exception.hpp"
37
38 #include <corelib/ncbistd.hpp>
39 #include <corelib/ncbi_system.hpp>
40
41 #include <serial/objostrasn.hpp>
42 #include <serial/serial.hpp>
43
44 #include <algo/align/nw/align_exception.hpp>
45 #include <algo/align/nw/nw_spliced_aligner16.hpp>
46 #include <algo/align/splign/splign_cmdargs.hpp>
47 #include <algo/align/util/hit_comparator.hpp>
48
49 #include <algo/blast/api/bl2seq.hpp>
50 #include <algo/blast/api/local_blast.hpp>
51 #include <algo/blast/api/objmgr_query_data.hpp>
52 #include <algo/blast/api/local_db_adapter.hpp>
53
54 #include <objmgr/seq_vector.hpp>
55
56 #include <objects/seq/Bioseq.hpp>
57 #include <objects/seqloc/Seq_loc.hpp>
58
59 #include <objects/seqalign/Score_set.hpp>
60 #include <objects/seqalign/Score.hpp>
61
62 #include <objtools/readers/fasta.hpp>
63 #include <objtools/readers/reader_exception.hpp>
64 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
65
66 #include <objtools/data_loaders/genbank/gbloader.hpp>
67
68 #include <objtools/lds2/lds2.hpp>
69 #include <objtools/data_loaders/lds2/lds2_dataloader.hpp>
70
71 #include <objtools/alnmgr/score_builder_base.hpp>
72
73 #include <algorithm>
74 #include <memory>
75
76 namespace {
77 const char kDirSense[] = "sense";
78 const char kDirAntisense[] = "antisense";
79 const char kDirBoth[] = "both";
80 const char kDirAuto[] = "auto";
81 const char kDirDefault[] = "default";
82 }
83
84
85 BEGIN_NCBI_SCOPE
86
CSplignApp(void)87 CSplignApp::CSplignApp(void)
88 {
89 auto version = Ref(&CSplign::s_GetVersion());
90 SetFullVersion(version);
91 m_AppName = version->Print("Splign");
92 }
93
94
Init()95 void CSplignApp::Init()
96 {
97 HideStdArgs(fHideHelp | fHideLogfile | fHideConffile |
98 fHideVersion | fHideFullVersion | fHideDryRun |
99 fHideXmlHelp | fHideFullHelp);
100
101 auto_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
102 argdescr->SetUsageContext(GetArguments().GetProgramName(), m_AppName);
103
104 argdescr->AddOptionalKey
105 ("hits", "hits",
106 "[Batch mode] Externally computed local alignments "
107 "(such as blast hits), in blast tabular format. "
108 "The file must be collated by subject and query "
109 "(e.g. sort -k 2,2 -k 1,1).",
110 CArgDescriptions::eInputFile);
111
112 argdescr->AddOptionalKey
113 ("comps", "comps",
114 "[Batch mode] Compartments computed with Compart utility.",
115 CArgDescriptions::eInputFile);
116
117 argdescr->AddOptionalKey
118 ("mklds", "mklds",
119 "Make LDS DB under the specified directory "
120 "with cDNA and genomic FASTA files or symlinks.",
121 CArgDescriptions::eString);
122
123 argdescr->AddOptionalKey
124 ("blastdb", "blastdb",
125 "Comma separated list of Blast DBs for sequence retrieval.",
126 CArgDescriptions::eString);
127
128 argdescr->AddOptionalKey
129 ("ldsdir", "ldsdir",
130 "Comma separated list of directories with LDS DBs.",
131 CArgDescriptions::eString);
132
133 argdescr->AddFlag
134 ("genbank",
135 "Add genbank loader. Not recommended in Batch mode.");
136
137 argdescr->AddOptionalKey
138 ("query", "query",
139 "[Pairwise mode] FASTA file with the spliced sequence. "
140 "Not compatible with \"-query_id\".",
141 CArgDescriptions::eInputFile);
142
143 argdescr->AddOptionalKey
144 ("query_id", "query_id",
145 "[Pairwise mode] Query id. Not compatible with \"-query\".",
146 CArgDescriptions::eString);
147
148 argdescr->AddOptionalKey
149 ("subj", "subj",
150 "[Pairwise mode] FASTA file with the genomic sequence. "
151 "Not compatible with \"-subj_id\".",
152 CArgDescriptions::eInputFile);
153
154 argdescr->AddOptionalKey
155 ("subj_id", "subj_id",
156 "[Pairwise mode] Subject id. Not compatible with \"-subj\".",
157 CArgDescriptions::eString);
158
159 argdescr->AddFlag
160 ("disc",
161 "[Pairwise mode] Use discontiguous megablast to facilitate "
162 "alignment of more divergent sequences such as those "
163 "from different organisms (cross-species alignment).");
164
165 argdescr->AddDefaultKey
166 ("W", "mbwordsize", "[Pairwise mode] Megablast word size",
167 CArgDescriptions::eInteger,
168 "28");
169
170 argdescr->AddOptionalKey
171 ("mask_ranges", "mask_ranges", "[Pairwise mode] regions to mask out, query only, formatted as '1-2,5-8,33-44', 1-based",
172 CArgDescriptions::eString);
173
174
175 CSplignArgUtil::SetupArgDescriptions(argdescr.get());
176
177 argdescr->AddDefaultKey
178 ("direction",
179 "direction",
180 "Query sequence orientation. "
181 "Auto orientation begins with the longest ORF direction (d1) "
182 "and proceeds with the opposite direction (d2) "
183 "if found a non-consensus splice in d1 or poly-a tail in d2. "
184
185 "Default translates to 'auto' in mRNA and "
186 "'both' in EST mode",
187 CArgDescriptions::eString, kDirDefault
188 );
189
190 argdescr->AddDefaultKey("log", "log", "Splign log file",
191 CArgDescriptions::eOutputFile,
192 "splign.log");
193
194 argdescr->AddOptionalKey("asn", "asn", "ASN.1 output file name",
195 CArgDescriptions::eOutputFile);
196
197 argdescr->AddOptionalKey("aln", "aln", "Pairwise alignment output file name",
198 CArgDescriptions::eOutputFile);
199
200 CArgAllow_Strings * constrain_direction (new CArgAllow_Strings);
201 constrain_direction
202 ->Allow(kDirDefault)
203 ->Allow(kDirSense)
204 ->Allow(kDirAntisense)
205 ->Allow(kDirBoth)
206 ->Allow(kDirAuto);
207
208 argdescr->SetConstraint("direction", constrain_direction);
209
210 SetupArgDescriptions(argdescr.release());
211
212 m_ObjMgr = CObjectManager::GetInstance();
213 }
214
215
s_ReadBlastHit(const string & m8)216 CSplign::THitRef CSplignApp::s_ReadBlastHit(const string& m8)
217 {
218 THitRef rv (new CBlastTabular(m8.c_str()));
219
220 #ifdef SPLIGNAPP_UNDECORATED_ARE_LOCALS
221 // make seq-id local if no type specified in the original m8
222 string::const_iterator ie = m8.end(), i0 = m8.begin(), i1 = i0;
223 while(i1 != ie && *i1 !='\t') ++i1;
224 if(i1 != ie) {
225 string::const_iterator i2 = ++i1;
226 while(i2 != ie && *i2 !='\t') ++i2;
227 if(i2 != ie) {
228 if(find(i0, i1, '|') == i1) {
229 const string strid = rv->GetQueryId()->GetSeqIdString(true);
230 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
231 rv->SetQueryId(seqid);
232 }
233 if(find(i1, i2, '|') == i2) {
234 const string strid = rv->GetSubjId()->GetSeqIdString(true);
235 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
236 rv->SetSubjId(seqid);
237 }
238 return rv;
239 }
240 }
241 const string errmsg = string("Incorrectly formatted blast hit:\n") + m8;
242 NCBI_THROW(CSplignAppException, eBadData, errmsg);
243 #else
244 return rv;
245 #endif
246 }
247
248
x_GetNextPair(const THitRefs & hitrefs,THitRefs * hitrefs_pair)249 bool CSplignApp::x_GetNextPair(const THitRefs& hitrefs, THitRefs* hitrefs_pair)
250 {
251 USING_SCOPE(objects);
252
253 hitrefs_pair->resize(0);
254
255 const size_t dim = hitrefs.size();
256 if(dim == 0) {
257 return false;
258 }
259
260 if(m_CurHitRef == dim) {
261 m_CurHitRef = numeric_limits<size_t>::max();
262 return false;
263 }
264
265 if(m_CurHitRef == numeric_limits<size_t>::max()) {
266 m_CurHitRef = 0;
267 }
268
269 CConstRef<CSeq_id> query (hitrefs[m_CurHitRef]->GetQueryId());
270 CConstRef<CSeq_id> subj (hitrefs[m_CurHitRef]->GetSubjId());
271 while(m_CurHitRef < dim
272 && hitrefs[m_CurHitRef]->GetQueryId()->Match(*query)
273 && hitrefs[m_CurHitRef]->GetSubjId()->Match(*subj) )
274 {
275 hitrefs_pair->push_back(hitrefs[m_CurHitRef++]);
276 }
277 return true;
278 }
279
280
x_GetNextPair(istream & ifs,THitRefs * hitrefs)281 bool CSplignApp::x_GetNextPair(istream& ifs, THitRefs* hitrefs)
282 {
283 hitrefs->resize(0);
284
285 if(!m_PendingHits.size() && !ifs ) {
286 return false;
287 }
288
289 if(!m_PendingHits.size()) {
290
291 THit::TId query, subj;
292
293 if(m_firstline.size()) {
294
295 THitRef hitref (s_ReadBlastHit(m_firstline));
296 query = hitref->GetQueryId();
297 subj = hitref->GetSubjId();
298 m_PendingHits.push_back(hitref);
299 }
300
301 char buf [1024];
302 while(ifs) {
303
304 buf[0] = 0;
305 CT_POS_TYPE pos0 = ifs.tellg();
306 ifs.getline(buf, sizeof buf, '\n');
307 CT_POS_TYPE pos1 = ifs.tellg();
308 if(pos1 == pos0) break; // GCC hack
309 if(buf[0] == '#') continue; // skip comments
310 const char* p = buf; // skip leading spaces
311 while(*p == ' ' || *p == '\t') ++p;
312 if(*p == 0) continue; // skip empty lines
313
314 THitRef hit (s_ReadBlastHit(p));
315 if(query.IsNull()) {
316 query = hit->GetQueryId();
317 }
318 if(subj.IsNull()) {
319 subj = hit->GetSubjId();
320 }
321 if(hit->GetQueryStrand() == false) {
322 hit->FlipStrands();
323 }
324 if(hit->GetSubjStop() == hit->GetSubjStart()) {
325 // skip single bases
326 continue;
327 }
328
329 if(hit->GetQueryId()->Match(*query) == false ||
330 hit->GetSubjId()->Match(*subj) == false) {
331
332 m_firstline = p;
333 break;
334 }
335
336 m_PendingHits.push_back(hit);
337 }
338 }
339
340 const size_t pending_size = m_PendingHits.size();
341 if(pending_size) {
342
343 THit::TId query = m_PendingHits[0]->GetQueryId();
344 THit::TId subj = m_PendingHits[0]->GetSubjId();
345 size_t i = 1;
346 for(; i < pending_size; ++i) {
347
348 THitRef h = m_PendingHits[i];
349 if(h->GetQueryId()->Match(*query) == false ||
350 h->GetSubjId()->Match(*subj) == false) {
351 break;
352 }
353 }
354 hitrefs->resize(i);
355 copy(m_PendingHits.begin(), m_PendingHits.begin() + i,
356 hitrefs->begin());
357 m_PendingHits.erase(m_PendingHits.begin(), m_PendingHits.begin() + i);
358 }
359
360 return hitrefs->size() > 0;
361 }
362
363
ReadCompartment(istream & istr,CSplign::THitRefs * phitrefs)364 void ReadCompartment(istream& istr, CSplign::THitRefs* phitrefs)
365 {
366 phitrefs->clear();
367 while(istr) {
368 string line;
369 getline(istr, line);
370 if(line.empty()) {
371 if(phitrefs->empty()) continue; else break;
372 }
373 CSplign::THitRef h (new CSplign::THit(line.c_str()));
374 phitrefs->push_back(h);
375 }
376 }
377
378
x_GetNextComp(istream & ifs,THitRefs * phitrefs,THit::TCoord * psubj_min,THit::TCoord * psubj_max)379 bool CSplignApp::x_GetNextComp(istream& ifs,
380 THitRefs* phitrefs,
381 THit::TCoord* psubj_min,
382 THit::TCoord* psubj_max)
383 {
384 static THitRefs hitrefs_next;
385 THitRefs & hitrefs (*phitrefs);
386
387 const THit::TCoord kUndef (numeric_limits<THit::TCoord>::max());
388 const THit::TCoord kMax (numeric_limits<THit::TCoord>::max() - 1);
389 static THit::TCoord smin (kUndef), smax (kUndef);
390
391 if(!hitrefs_next.empty()) {
392 hitrefs.resize(hitrefs_next.size());
393 copy(hitrefs_next.begin(), hitrefs_next.end(), hitrefs.begin());
394 hitrefs_next.clear();
395 }
396 else {
397 // read the first compartment
398 ReadCompartment(ifs, phitrefs);
399 }
400
401 // read the next compartment
402 ReadCompartment(ifs, &hitrefs_next);
403
404 // init coord range - may clarify further
405 if(smin != kUndef) {
406 *psubj_min = smin;
407 *psubj_max = kMax;
408 }
409 else if(smax != kUndef) {
410 *psubj_min = 0;
411 *psubj_max = smax;
412 }
413 else {
414 *psubj_min = 0;
415 *psubj_max = kMax;
416 }
417
418 if(!hitrefs_next.empty()
419 && hitrefs.front()->GetSubjStrand() == hitrefs_next.front()->GetSubjStrand()
420 && hitrefs.front()->GetQueryId()->Match(*(hitrefs_next.front()->GetQueryId()))
421 && hitrefs.front()->GetSubjId()->Match(*(hitrefs_next.front()->GetSubjId())))
422 {
423 {{
424 //check if compartments intersect
425 THit::TCoord min1 = min(hitrefs.front()->GetSubjMin(),
426 hitrefs.back()->GetSubjMin());
427 THit::TCoord min2 = min(hitrefs_next.front()->GetSubjMin(),
428 hitrefs_next.back()->GetSubjMin());
429 THit::TCoord max1 = max(hitrefs.front()->GetSubjMax(),
430 hitrefs.back()->GetSubjMax());
431 THit::TCoord max2 = max(hitrefs_next.front()->GetSubjMax(),
432 hitrefs_next.back()->GetSubjMax());
433 if( max1 > min1 && min2 < max1 ) {
434 CNcbiOstrstream ostr;
435 ostr << "\nTwo compartments intersect for pair "<<hitrefs.front()->GetQueryId()->AsFastaString()
436 <<" "<<hitrefs.front()->GetSubjId()->AsFastaString()
437 << "\n compartment ranges are"
438 <<"\n(" << min1+1 << ", " << max1+1 << ')'
439 <<"\n(" << min2+1 << ", " << max2+1 << ")\n";
440 const string errmsg = CNcbiOstrstreamToString(ostr);
441 NCBI_THROW2(CAlgoAlignException, ePattern, errmsg, eDiag_Fatal);
442 }
443
444 }}
445
446 if(hitrefs.front()->GetSubjStart() < hitrefs_next.front()->GetSubjStart()) {
447 *psubj_min = smin != kUndef? smin: 0;
448 *psubj_max = min(hitrefs_next.front()->GetSubjMin(),
449 hitrefs_next.back()->GetSubjMin());
450 smin = max(hitrefs.front()->GetSubjMax(),
451 hitrefs.back()->GetSubjMax());
452 smax = kUndef;
453 }
454 else {
455 *psubj_min = max(hitrefs_next.front()->GetSubjMax(),
456 hitrefs_next.back()->GetSubjMax());
457 *psubj_max = smax != kUndef? smax: kMax;
458 smin = kUndef;
459 smax = min(hitrefs.front()->GetSubjMin(),
460 hitrefs.back()->GetSubjMin());
461 }
462 }
463 else {
464 smin = smax = kUndef;
465 }
466
467 return !hitrefs.empty();
468 }
469
470
x_LogCompartmentStatus(const THit::TId & query,const THit::TId & subj,const CSplign::SAlignedCompartment & ac)471 void CSplignApp::x_LogCompartmentStatus(const THit::TId & query,
472 const THit::TId & subj,
473 const CSplign::SAlignedCompartment & ac)
474 {
475 typedef CSplign::SAlignedCompartment TCompartment;
476
477 switch(ac.m_Status) {
478
479 case TCompartment::eStatus_Ok: {
480
481 if(ac.m_Id == 0) {
482 NCBI_THROW(CSplignAppException, eInternal, "Missing compartment id.");
483 }
484
485 *m_logstream << (ac.m_QueryStrand? '+': '-') << ac.m_Id
486 << '\t' << query->GetSeqIdString(true)
487 << '\t' << subj->GetSeqIdString(true)
488 << '\t' << ac.m_Msg
489 << '\t' << ac.m_Score
490 << endl;
491 }
492 break;
493
494 case TCompartment::eStatus_Error: {
495
496 *m_logstream << '-'
497 << '\t' << query->GetSeqIdString(true)
498 << '\t' << subj->GetSeqIdString(true)
499 << '\t' << ac.m_Msg
500 << '\t' << '-'
501 << endl;
502 }
503 break;
504
505 case TCompartment::eStatus_Empty:
506 break;
507
508 default: {
509 NCBI_THROW(CSplignAppException, eInternal,
510 "Unexpected compartment status.");
511 }
512 }
513
514 }
515
516
517 CRef<blast::CBlastOptionsHandle>
x_SetupBlastOptions(bool use_disc)518 CSplignApp::x_SetupBlastOptions(bool use_disc)
519 {
520 USING_SCOPE(blast);
521
522 m_BlastProgram = use_disc? eDiscMegablast: eMegablast;
523
524 CRef<CBlastOptionsHandle> blast_options_handle
525 (CBlastOptionsFactory::Create(m_BlastProgram));
526
527 blast_options_handle->SetDefaults();
528
529 CBlastOptions& blast_opt = blast_options_handle->SetOptions();
530
531 if(!use_disc) {
532
533 const CArgs& args = GetArgs();
534 blast_opt.SetWordSize(args["W"].AsInteger());
535 blast_opt.SetMaskAtHash(true);
536 blast_opt.SetDustFiltering(false);
537 }
538
539 if(blast_options_handle->Validate() == false) {
540 NCBI_THROW(CSplignAppException,
541 eInternal,
542 "Incorrect blast setup");
543 }
544
545 return blast_options_handle;
546 }
547
548
549 enum ERunMode {
550 eNotSet,
551 ePairwise, // single query vs single subj
552 eBatch1, // use external raw blast hits
553 eBatch2 // use pre-computed compartments
554 };
555
556
557
558 const char kSplignLdsDb[] = "splign.lds2db";
559
GetLdsDbDir(const string & fasta_dir)560 string GetLdsDbDir(const string& fasta_dir)
561 {
562 string lds_db_dir = fasta_dir;
563 const char sep = CDirEntry::GetPathSeparator();
564 const size_t fds = fasta_dir.size();
565 if(fds > 0 && fasta_dir[fds-1] != sep) {
566 lds_db_dir += sep;
567 }
568 lds_db_dir += "_SplignLDS2_";
569 return lds_db_dir;
570 }
571
572
x_ReadFastaSetId(const CArgValue & argval,CRef<objects::CScope> scope)573 CRef<objects::CSeq_id> CSplignApp::x_ReadFastaSetId(const CArgValue& argval,
574 CRef<objects::CScope> scope)
575 {
576 USING_SCOPE(objects);
577
578 CRef<ILineReader> line_reader;
579 try {
580 line_reader.Reset(
581 new CMemoryLineReader(new CMemoryFile(argval.AsString()),
582 eTakeOwnership));
583 } catch (...) { // fall back to streams
584 line_reader.Reset(new CStreamLineReader(argval.AsInputFile()));
585 }
586 CFastaReader fasta_reader(* line_reader,
587 CFastaReader::fAssumeNuc | CFastaReader::fOneSeq);
588 CConstRef<CSeq_entry> se (fasta_reader.ReadOneSeq());
589
590 scope->AddTopLevelSeqEntry(*se);
591 const CSeq_entry::TSeq& bioseq = se->GetSeq();
592 const CSeq_entry::TSeq::TId& seqid = bioseq.GetId();
593 return seqid.back();
594 }
595
596
Run()597 int CSplignApp::Run()
598 {
599 USING_SCOPE(objects);
600
601 const CArgs & args (GetArgs());
602
603 // check that modes aren't mixed
604
605 if( args["query"] && args["query_id"] ) {
606 NCBI_THROW(CSplignAppException,
607 eBadParameter,
608 "\"-query\" is not compatible with \"-query_id\". "
609 "Specify -help to print arguments." );
610 }
611
612 if( args["subj"] && args["subj_id"] ) {
613 NCBI_THROW(CSplignAppException,
614 eBadParameter,
615 "\"-subj\" is not compatible with \"-subj_id\". "
616 "Specify -help to print arguments." );
617 }
618
619 const bool is_mklds = args["mklds"];
620
621 const bool is_hits = args["hits"];
622 const bool is_query = args["query"] || args["query_id"];
623 const bool is_subj = args["subj"] || args["subj_id"];
624 const bool is_comps = args["comps"];
625
626 const bool use_disc_megablast (args["disc"]);
627
628 if (is_mklds) {
629 // create LDS DB and exit
630 string fa_dir = args["mklds"].AsString();
631 if(CDirEntry::IsAbsolutePath(fa_dir) == false) {
632 string curdir = CDir::GetCwd();
633 const char sep = CDirEntry::GetPathSeparator();
634 const size_t curdirsize = curdir.size();
635 if(curdirsize && curdir[curdirsize-1] != sep) {
636 curdir += sep;
637 }
638 fa_dir = curdir + fa_dir;
639 }
640
641 const string lds_db_dir (GetLdsDbDir(fa_dir));
642 CDir dir(lds_db_dir);
643 dir.Create();
644 string db_file = CDirEntry::ConcatPath(lds_db_dir, kSplignLdsDb);
645 CLDS2_Manager ldsmgr(db_file);
646 ldsmgr.AddDataDir(fa_dir, CLDS2_Manager::eDir_Recurse);
647 ldsmgr.UpdateData();
648
649 return 0;
650 }
651
652 // determine mode and verify arguments
653 ERunMode run_mode (eNotSet);
654
655 if(is_query && is_subj && !(is_hits || is_comps)) {
656 run_mode = ePairwise;
657 }
658 else if(is_hits && !(is_comps ||is_query || is_subj)) {
659 run_mode = eBatch1;
660 }
661 else if(is_comps && !(is_hits ||is_query || is_subj)) {
662 run_mode = eBatch2;
663 }
664
665 if(run_mode == eNotSet) {
666 NCBI_THROW(CSplignAppException,
667 eBadParameter,
668 "Incomplete or inconsistent set of arguments specified. "
669 "Specify -help to print arguments." );
670 }
671
672 // open log stream
673 m_logstream = & args["log"].AsOutputFile();
674
675 // open asn output stream, if any
676 m_AsnOut = args["asn"]? & args["asn"].AsOutputFile(): NULL;
677
678 // open paiwise alignment output stream, if any
679 m_AlnOut = args["aln"]? & args["aln"].AsOutputFile(): NULL;
680
681 // in pairwise mode, setup blast options
682 if(run_mode == ePairwise) {
683 m_BlastOptionsHandle = x_SetupBlastOptions(use_disc_megablast);
684 }
685
686 // splign and formatter setup
687 m_Splign.Reset(new CSplign);
688 CSplignArgUtil::ArgsToSplign(m_Splign, args);
689
690 m_Splign->SetStartModelId(1);
691
692 // splign formatter object
693 m_Formatter.Reset(new CSplignFormatter(*m_Splign));
694
695 // add loaders
696 CRef<CScope> scope;
697 scope.Reset (new CScope(*m_ObjMgr));
698
699
700 {{
701 if(args["genbank"]) {
702 CGBDataLoader::RegisterInObjectManager(*m_ObjMgr);
703 }
704
705 int priority = 1;
706
707 typedef vector<string> TDbs;
708 TDbs dbs;
709
710 string blastdbstr;
711 if (args["blastdb"].HasValue()) {
712 blastdbstr = args["blastdb"].AsString();
713 }
714 dbs.clear();
715 NStr::Split(blastdbstr, ",", dbs);
716 ITERATE(TDbs, i, dbs) {
717 string db = NStr::TruncateSpaces(*i);
718 if (db.empty()) {
719 continue;
720 }
721 CBlastDbDataLoader::EDbType dbtype(CBlastDbDataLoader::eUnknown);
722 if (NStr::StartsWith(db, "na:", NStr::eNocase)) {
723 db.erase(0, 3);
724 dbtype = CBlastDbDataLoader::eNucleotide;
725 } else if (NStr::StartsWith(db, "aa:", NStr::eNocase)) {
726 db.erase(0, 3);
727 dbtype = CBlastDbDataLoader::eProtein;
728 }
729 CBlastDbDataLoader::RegisterInObjectManager(*m_ObjMgr,db, dbtype, true,CObjectManager::eDefault,priority);
730 LOG_POST(Info << "added loader: BlastDB: "
731 << (dbtype == CBlastDbDataLoader::eNucleotide ?
732 "nucl" : "protein")
733 << ": " << db
734 << " (" << priority << ")");
735 ++priority;
736 }
737
738 string ldsdbstr;
739 if (args["ldsdir"].HasValue()) {
740 ldsdbstr = args["ldsdir"].AsString();
741 }
742 dbs.clear();
743 NStr::Split(ldsdbstr, ",", dbs);
744 ITERATE(TDbs, i, dbs) {
745 const string fasta_dir = *i;
746 string db = GetLdsDbDir(fasta_dir);
747 db = NStr::TruncateSpaces(db);
748
749 if (db.empty()) {
750 continue;
751 }
752 db = CDirEntry::CreateAbsolutePath(db);
753 db = CDirEntry::NormalizePath(db);
754
755 string db_file = CDirEntry::ConcatPath(db, kSplignLdsDb);
756 CLDS2_DataLoader::RegisterInObjectManager(*m_ObjMgr, db_file, -1, CObjectManager::eDefault, priority);
757 LOG_POST(Info << "added loader: LDS: " << db << " (" << priority << ")");
758 ++priority;
759 }
760 }}
761
762 scope->AddDefaults();
763 m_Splign->SetScope() = scope;
764
765 // run splign in selected mode
766 if(run_mode == ePairwise) {
767 //query
768 CRef<CSeq_id> seqid_query;
769 if(args["query"]) {
770 seqid_query = x_ReadFastaSetId(args["query"], scope);
771 } else if(args["query_id"]) {
772 const string& id_list = args["query_id"].AsString();
773 try {
774 CBioseq::TId ids;
775 CSeq_id::ParseFastaIds(ids, id_list);
776 seqid_query = FindBestChoice(ids, objects::CSeq_id::Score);
777 if(!seqid_query) seqid_query = new CSeq_id(CSeq_id::e_Local, id_list);
778
779 } catch(CException) {
780 seqid_query.Reset (new CSeq_id(CSeq_id::e_Local, id_list));
781 }
782 } else {
783 NCBI_THROW(CSplignAppException,
784 eBadParameter,
785 "Query is missing. "
786 "Specify -help to print arguments." );
787 }
788
789 //subject
790 CRef<CSeq_id> seqid_subj;
791 if(args["subj"]) {
792 seqid_subj = x_ReadFastaSetId(args["subj"], scope);
793 } else if(args["subj_id"]) {
794 const string& id_list = args["subj_id"].AsString();
795 try {
796 CBioseq::TId ids;
797 CSeq_id::ParseFastaIds(ids, id_list);
798 seqid_subj = FindBestChoice(ids, objects::CSeq_id::Score);
799 if(!seqid_subj) seqid_subj = new CSeq_id(CSeq_id::e_Local, id_list);
800
801 } catch(CException) {
802 seqid_subj.Reset (new CSeq_id(CSeq_id::e_Local, id_list));
803 }
804 } else {
805 NCBI_THROW(CSplignAppException,
806 eBadParameter,
807 "Subject is missing. "
808 "Specify -help to print arguments." );
809 }
810
811 THitRefs hitrefs;
812 x_GetBl2SeqHits(seqid_query, seqid_subj, scope, &hitrefs);
813 x_ProcessPair(hitrefs, args);
814 }
815 else if (run_mode == eBatch1) {
816
817 THitRefs hitrefs;
818 CNcbiIstream& hit_stream = args["hits"].AsInputFile();
819 while(x_GetNextPair(hit_stream, &hitrefs) ) {
820 x_ProcessPair(hitrefs, args);
821 }
822 }
823 else if (run_mode == eBatch2) {
824
825 CNcbiIstream& hit_stream (args["comps"].AsInputFile());
826 THitRefs hitrefs;
827 THit::TCoord subj_min, subj_max;
828
829 while(x_GetNextComp(hit_stream, &hitrefs, &subj_min, &subj_max) ) {
830
831 if(hitrefs.front()->GetScore() > 0) {
832 x_ProcessPair(hitrefs, args, subj_min, subj_max);
833 }
834 }
835 }
836 else {
837 NCBI_THROW(CSplignAppException,
838 eInternal,
839 "Mode not implemented");
840 }
841
842 cout << "# END" << endl;
843
844 return 0;
845 }
846
847
x_GetBl2SeqHits(CRef<objects::CSeq_id> seqid_query,CRef<objects::CSeq_id> seqid_subj,CRef<objects::CScope> scope,THitRefs * phitrefs)848 void CSplignApp::x_GetBl2SeqHits(
849 CRef<objects::CSeq_id> seqid_query,
850 CRef<objects::CSeq_id> seqid_subj,
851 CRef<objects::CScope> scope,
852 THitRefs* phitrefs)
853 {
854 USING_SCOPE(blast);
855 USING_SCOPE(objects);
856
857 phitrefs->resize(0);
858 phitrefs->reserve(100);
859
860 CRef<CSeq_loc> seqloc_query (new CSeq_loc);
861 seqloc_query->SetWhole().Assign(*seqid_query);
862 CRef<CSeq_loc> seqloc_subj (new CSeq_loc);
863 seqloc_subj->SetWhole().Assign(*seqid_subj);
864
865 CBl2Seq Blast( SSeqLoc(seqloc_query.GetNonNullPointer(),
866 scope.GetNonNullPointer()),
867 SSeqLoc(seqloc_subj.GetNonNullPointer(),
868 scope.GetNonNullPointer()),
869 m_BlastProgram);
870
871 Blast.SetOptionsHandle() = *m_BlastOptionsHandle;
872
873 TSeqAlignVector blast_output (Blast.Run());
874
875 ITERATE(TSeqAlignVector, ii, blast_output) {
876 if((*ii)->IsSet()) {
877 const CSeq_align_set::Tdata &sas0 = (*ii)->Get();
878 ITERATE(CSeq_align_set::Tdata, sa_iter, sas0) {
879 CRef<CBlastTabular> hitref (new CBlastTabular(**sa_iter));
880 if(hitref->GetQueryStrand() == false) {
881 hitref->FlipStrands();
882 }
883 phitrefs->push_back(hitref);
884 }
885 }
886 }
887 }
888
x_RunSplign(bool raw_hits,THitRefs * phitrefs,THit::TCoord smin,THit::TCoord smax,CSplign::TResults * psplign_results)889 void CSplignApp::x_RunSplign(bool raw_hits, THitRefs* phitrefs,
890 THit::TCoord smin, THit::TCoord smax,
891 CSplign::TResults * psplign_results)
892 {
893 if(raw_hits) {
894 m_Splign->Run(phitrefs);
895 const CSplign::TResults& results (m_Splign->GetResult());
896 copy(results.begin(), results.end(), back_inserter(*psplign_results));
897 }
898 else {
899 CSplign::SAlignedCompartment ac;
900 m_Splign->AlignSingleCompartment(phitrefs, smin, smax, &ac);
901 psplign_results->push_back(ac);
902 }
903 }
904
905
906 // get the number of non-consensus splices in a compartment
907 // with the highest match count
GetNonConsensusSpliceCount(const CSplign::TResults & splign_results)908 size_t GetNonConsensusSpliceCount(const CSplign::TResults & splign_results)
909 {
910 size_t top_matches (0);
911 size_t rv (0);
912 ITERATE(CSplign::TResults, ii, splign_results) {
913
914 const CSplign::SAlignedCompartment & ac (*ii);
915 size_t matches (0), nc_count(0);
916 typedef CSplign::TSegments::const_iterator TIterator;
917 char dnr [] = {0, 0, 0};
918 char acc [] = {0, 0, 0};
919 size_t exon_count (0);
920
921 for(TIterator jjb (ac.m_Segments.begin()), jje (ac.m_Segments.end()), jj(jjb);
922 jj != jje; ++jj)
923 {
924
925 if(jj->m_exon) {
926
927 const char * p (jj->m_details.data()), * pe (p +jj->m_details.size());
928 int n (-1);
929 for(; p != pe; ++p) {
930 if(*p == 'M') {
931 if(n == 0) ++matches; else if(n > 0) matches += n;
932 n = 0;
933 }
934 else if(isdigit(*p) && n >= 0) {
935 n = n * 10 + *p - '0';
936 }
937 else {
938 if(n == 0) {
939 ++matches;
940 }
941 n = -1;
942 }
943 }
944 if(n == 0) ++matches; else if(n > 0) matches += n;
945
946 if(exon_count > 0) {
947
948 if(jj->m_annot[2] == '<') {
949
950 acc[0] = jj->m_annot[0];
951 acc[1] = jj->m_annot[1];
952
953 if(!CNWFormatter::SSegment::s_IsConsensusSplice(dnr, acc)) {
954 ++nc_count;
955 }
956 }
957 acc[0] = acc[1] = 0;
958 }
959
960 p = jj->m_annot.data();
961 while(*p++ != '>');
962 dnr[0] = *p++;
963 dnr[1] = *p;
964
965 ++exon_count;
966 }
967 }
968
969 if(matches > top_matches) {
970 rv = nc_count;
971 top_matches = matches;
972 }
973 }
974
975 return rv;
976 }
977
978
979 struct SComplement
980 {
operator ()SComplement981 char operator() (char c) {
982 switch(c) {
983 case 'A': return 'T';
984 case 'G': return 'C';
985 case 'T': return 'A';
986 case 'C': return 'G';
987 }
988 return c;
989 }
990 };
991
992
x_ProcessPair(THitRefs & hitrefs,const CArgs & args,THit::TCoord smin,THit::TCoord smax)993 void CSplignApp::x_ProcessPair(THitRefs& hitrefs, const CArgs& args,
994 THit::TCoord smin, THit::TCoord smax)
995 {
996
997 const int flags (CSplignFormatter::eTF_NoExonScores | CSplignFormatter::eTF_UseFastaStyleIds);
998
999 const bool raw_hits (!args["comps"]);
1000
1001 if(hitrefs.size() == 0) {
1002 return;
1003 }
1004
1005 // skip void compartments but obey their bounds
1006 if(hitrefs.front()->GetScore() < 0) {
1007 return;
1008 }
1009
1010 // MASK TEST
1011 {{
1012 CSeq_id_Handle qidh = CSeq_id_Handle::GetHandle(*(hitrefs.front()->GetQueryId()));
1013 CSplign::TSeqRangeColl MaskRanges;
1014 if(args["mask_ranges"].HasValue()) {
1015 try {
1016 string mask_ranges_arg = args["mask_ranges"].AsString();
1017 list<string> range_strs;
1018 NStr::Split(mask_ranges_arg, ",", range_strs);
1019 ITERATE(list<string>, range_iter, range_strs) {
1020 list<string> bound_strs;
1021 NStr::Split(*range_iter, "-", bound_strs);
1022 TSeqRange Curr(NStr::StringToNumeric<TSeqPos>(bound_strs.front())-1,
1023 NStr::StringToNumeric<TSeqPos>(bound_strs.back())-1);
1024 MaskRanges += Curr;
1025 }
1026 } catch(CException& e) {
1027 ERR_POST("Masking Ranges parse error: " + e.ReportAll());
1028 }
1029 }
1030 if(qidh && !MaskRanges.empty()) {
1031 m_Splign->SetHardMaskRanges(qidh, MaskRanges);
1032 }
1033 }}
1034
1035 THit::TId query (hitrefs.front()->GetQueryId());
1036 THit::TId subj (hitrefs.front()->GetSubjId());
1037
1038 m_Formatter->SetSeqIds(query, subj);
1039
1040 string strand (args["direction"].AsString());
1041
1042 if(strand == kDirDefault) {
1043 strand = (args["type"].AsString() == kQueryType_mRNA)? kDirAuto: kDirBoth;
1044 }
1045
1046 CSplign::TResults splign_results;
1047
1048 if(strand == kDirSense) {
1049
1050 m_Splign->SetStrand(true);
1051 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1052 }
1053 else if(strand == kDirAntisense) {
1054
1055 m_Splign->SetStrand(false);
1056 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1057 }
1058 else if(strand == kDirBoth) {
1059
1060 // save original hits
1061 THitRefs hits0;
1062 ITERATE(THitRefs, ii, hitrefs) {
1063 const THitRef & h0 (*ii);
1064 THitRef h1 (new THit (*h0));
1065 hits0.push_back(h1);
1066 }
1067
1068 static size_t mid (1);
1069 size_t mid_plus, mid_minus;
1070 {{
1071 m_Splign->SetStrand(true);
1072 m_Splign->SetStartModelId(mid);
1073 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1074 mid_plus = m_Splign->GetNextModelId();
1075 }}
1076 {{
1077 m_Splign->SetStrand(false);
1078 m_Splign->SetStartModelId(mid);
1079 x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
1080 mid_minus = m_Splign->GetNextModelId();
1081 }}
1082 mid = max(mid_plus, mid_minus);
1083 }
1084 else {
1085
1086 // 'auto' means to align both directions when in doubt
1087
1088 THitRefs hits0;
1089 ITERATE(THitRefs, ii, hitrefs) {
1090 const THitRef & h0 (*ii);
1091 THitRef h1 (new THit (*h0));
1092 hits0.push_back(h1);
1093 }
1094
1095 // determine the direction with the longest ORF
1096 const CSplign::TOrfPair orfs (m_Splign->GetCds(hitrefs.front()->GetQueryId()));
1097 const size_t orf_sense (orfs.first.second - orfs.first.first);
1098 const size_t orf_antisense (orfs.second.first - orfs.second.second);
1099 const bool sense_first (orf_sense >= orf_antisense);
1100
1101 static size_t mid (1);
1102 size_t mid_first, mid_second;
1103
1104 // align in the longest ORF direction
1105 m_Splign->SetStrand(sense_first);
1106 m_Splign->SetStartModelId(mid);
1107 x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1108 mid_first = m_Splign->GetNextModelId();
1109
1110 // if there is a non-consensus splice, also align in the opposite direction
1111 const size_t nc_count (GetNonConsensusSpliceCount(splign_results));
1112
1113 // same if there is a poly-a in the opposite direction
1114 bool polya_found (false);
1115 if(nc_count == 0) {
1116 CRef<CScope> scope (m_Splign->GetScope());
1117 CConstRef<CSeq_id> seqid_query (hits0.front()->GetQueryId());
1118 CBioseq_Handle bh (scope->GetBioseqHandle(*seqid_query));
1119 CSeqVector sv (bh.GetSeqVector(CBioseq_Handle
1120 ::eCoding_Iupac));
1121 string str;
1122 sv.GetSeqData(0, sv.size(), str);
1123 if(sense_first) {
1124 reverse (str.begin(), str.end());
1125 transform(str.begin(), str.end(), str.begin(), SComplement());
1126 }
1127 const size_t polya (CSplign::s_TestPolyA(str.data(), str.size()));
1128 polya_found = (0 < polya && polya < str.size());
1129 }
1130
1131 if(nc_count > 0 || polya_found) {
1132 m_Splign->SetStrand(!sense_first);
1133 m_Splign->SetStartModelId(mid);
1134 x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
1135 mid_second = m_Splign->GetNextModelId();
1136 mid = max(mid_first, mid_second);
1137 }
1138 else {
1139 mid = mid_first;
1140 }
1141 }
1142
1143 cout << m_Formatter->AsExonTable(&splign_results, flags);
1144
1145 if(m_AsnOut) {
1146
1147 const CSplignFormatter::EAsnFlags flags
1148 = CSplignFormatter::EAsnFlags(CSplignFormatter::eAF_SplicedSegWithParts);
1149
1150 CRef<CSeq_align_set> sas (m_Formatter->AsSeqAlignSet(&splign_results,
1151 flags));
1152
1153 CScoreBuilderBase score_builder;
1154 NON_CONST_ITERATE (CSeq_align_set::Tdata, align_it, sas->Set()) {
1155 score_builder.AddScore(*m_Splign->GetScope(), **align_it,
1156 CSeq_align::eScore_ConsensusSplices);
1157 score_builder.AddScore(*m_Splign->GetScope(), **align_it,
1158 CSeq_align::eScore_Splices);
1159 }
1160
1161
1162 *m_AsnOut << MSerial_AsnText << *sas << endl;
1163 }
1164
1165 if(m_AlnOut) {
1166 *m_AlnOut << m_Formatter->AsAlignmentText(m_Splign->GetScope(),
1167 &splign_results);
1168 }
1169
1170 ITERATE(CSplign::TResults, ii, splign_results) {
1171 x_LogCompartmentStatus(query, subj, *ii);
1172 }
1173 }
1174
1175
1176 END_NCBI_SCOPE
1177
1178 /////////////////////////////////////
1179
1180 USING_NCBI_SCOPE;
1181
main(int argc,const char * argv[])1182 int main(int argc, const char* argv[])
1183 {
1184 const int rv (CSplignApp().AppMain(argc, argv));
1185 return rv;
1186 }
1187