1 /* $Id: splign_app.cpp 592172 2019-08-27 19:18:46Z vasilche $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Yuri Kapustin
27  *
28  * File Description: Splign application
29  *
30 */
31 
32 
33 #include <ncbi_pch.hpp>
34 
35 #include "splign_app.hpp"
36 #include "splign_app_exception.hpp"
37 
38 #include <corelib/ncbistd.hpp>
39 #include <corelib/ncbi_system.hpp>
40 
41 #include <serial/objostrasn.hpp>
42 #include <serial/serial.hpp>
43 
44 #include <algo/align/nw/align_exception.hpp>
45 #include <algo/align/nw/nw_spliced_aligner16.hpp>
46 #include <algo/align/splign/splign_cmdargs.hpp>
47 #include <algo/align/util/hit_comparator.hpp>
48 
49 #include <algo/blast/api/bl2seq.hpp>
50 #include <algo/blast/api/local_blast.hpp>
51 #include <algo/blast/api/objmgr_query_data.hpp>
52 #include <algo/blast/api/local_db_adapter.hpp>
53 
54 #include <objmgr/seq_vector.hpp>
55 
56 #include <objects/seq/Bioseq.hpp>
57 #include <objects/seqloc/Seq_loc.hpp>
58 
59 #include <objects/seqalign/Score_set.hpp>
60 #include <objects/seqalign/Score.hpp>
61 
62 #include <objtools/readers/fasta.hpp>
63 #include <objtools/readers/reader_exception.hpp>
64 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
65 
66 #include <objtools/data_loaders/genbank/gbloader.hpp>
67 
68 #include <objtools/lds2/lds2.hpp>
69 #include <objtools/data_loaders/lds2/lds2_dataloader.hpp>
70 
71 #include <objtools/alnmgr/score_builder_base.hpp>
72 
73 #include <algorithm>
74 #include <memory>
75 
76 namespace {
77     const char kDirSense[]     = "sense";
78     const char kDirAntisense[] = "antisense";
79     const char kDirBoth[]      = "both";
80     const char kDirAuto[]      = "auto";
81     const char kDirDefault[]   = "default";
82 }
83 
84 
85 BEGIN_NCBI_SCOPE
86 
CSplignApp(void)87 CSplignApp::CSplignApp(void)
88 {
89     auto version = Ref(&CSplign::s_GetVersion());
90     SetFullVersion(version);
91     m_AppName = version->Print("Splign");
92 }
93 
94 
Init()95 void CSplignApp::Init()
96 {
97     HideStdArgs(fHideHelp    | fHideLogfile | fHideConffile |
98                 fHideVersion | fHideFullVersion | fHideDryRun  |
99                 fHideXmlHelp | fHideFullHelp);
100 
101     auto_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
102     argdescr->SetUsageContext(GetArguments().GetProgramName(), m_AppName);
103 
104     argdescr->AddOptionalKey
105         ("hits", "hits",
106          "[Batch mode] Externally computed local alignments "
107          "(such as blast hits), in blast tabular format. "
108          "The file must be collated by subject and query "
109          "(e.g. sort -k 2,2 -k 1,1).",
110          CArgDescriptions::eInputFile);
111 
112     argdescr->AddOptionalKey
113         ("comps", "comps",
114          "[Batch mode] Compartments computed with Compart utility.",
115          CArgDescriptions::eInputFile);
116 
117     argdescr->AddOptionalKey
118         ("mklds", "mklds",
119          "Make LDS DB under the specified directory "
120          "with cDNA and genomic FASTA files or symlinks.",
121          CArgDescriptions::eString);
122 
123     argdescr->AddOptionalKey
124         ("blastdb", "blastdb",
125          "Comma separated list of Blast DBs for sequence retrieval.",
126          CArgDescriptions::eString);
127 
128     argdescr->AddOptionalKey
129         ("ldsdir", "ldsdir",
130          "Comma separated list of directories with LDS DBs.",
131          CArgDescriptions::eString);
132 
133     argdescr->AddFlag
134         ("genbank",
135          "Add genbank loader. Not recommended in Batch mode.");
136 
137     argdescr->AddOptionalKey
138         ("query", "query",
139          "[Pairwise mode] FASTA file with the spliced sequence. "
140          "Not compatible with \"-query_id\".",
141          CArgDescriptions::eInputFile);
142 
143     argdescr->AddOptionalKey
144         ("query_id", "query_id",
145          "[Pairwise mode] Query id. Not compatible with \"-query\".",
146          CArgDescriptions::eString);
147 
148     argdescr->AddOptionalKey
149         ("subj", "subj",
150          "[Pairwise mode] FASTA file with the genomic sequence. "
151          "Not compatible with \"-subj_id\".",
152          CArgDescriptions::eInputFile);
153 
154     argdescr->AddOptionalKey
155         ("subj_id", "subj_id",
156          "[Pairwise mode] Subject id. Not compatible with \"-subj\".",
157          CArgDescriptions::eString);
158 
159     argdescr->AddFlag
160         ("disc",
161          "[Pairwise mode] Use discontiguous megablast to facilitate "
162          "alignment of more divergent sequences such as those "
163          "from different organisms (cross-species alignment).");
164 
165     argdescr->AddDefaultKey
166         ("W", "mbwordsize", "[Pairwise mode] Megablast word size",
167          CArgDescriptions::eInteger,
168          "28");
169 
170     argdescr->AddOptionalKey
171         ("mask_ranges", "mask_ranges", "[Pairwise mode] regions to mask out, query only, formatted as '1-2,5-8,33-44', 1-based",
172          CArgDescriptions::eString);
173 
174 
175     CSplignArgUtil::SetupArgDescriptions(argdescr.get());
176 
177     argdescr->AddDefaultKey
178         ("direction",
179          "direction",
180          "Query sequence orientation. "
181          "Auto orientation begins with the longest ORF direction (d1) "
182          "and proceeds with the opposite direction (d2) "
183          "if found a non-consensus splice in d1 or poly-a tail in d2. "
184 
185          "Default translates to 'auto' in mRNA and "
186          "'both' in EST mode",
187          CArgDescriptions::eString,   kDirDefault
188          );
189 
190     argdescr->AddDefaultKey("log", "log", "Splign log file",
191                             CArgDescriptions::eOutputFile,
192                             "splign.log");
193 
194     argdescr->AddOptionalKey("asn", "asn", "ASN.1 output file name",
195                              CArgDescriptions::eOutputFile);
196 
197     argdescr->AddOptionalKey("aln", "aln", "Pairwise alignment output file name",
198                              CArgDescriptions::eOutputFile);
199 
200     CArgAllow_Strings * constrain_direction (new CArgAllow_Strings);
201     constrain_direction
202         ->Allow(kDirDefault)
203         ->Allow(kDirSense)
204         ->Allow(kDirAntisense)
205         ->Allow(kDirBoth)
206         ->Allow(kDirAuto);
207 
208     argdescr->SetConstraint("direction", constrain_direction);
209 
210     SetupArgDescriptions(argdescr.release());
211 
212     m_ObjMgr = CObjectManager::GetInstance();
213 }
214 
215 
s_ReadBlastHit(const string & m8)216 CSplign::THitRef CSplignApp::s_ReadBlastHit(const string& m8)
217 {
218     THitRef rv (new CBlastTabular(m8.c_str()));
219 
220 #ifdef SPLIGNAPP_UNDECORATED_ARE_LOCALS
221     // make seq-id local if no type specified in the original m8
222     string::const_iterator ie = m8.end(), i0 = m8.begin(), i1 = i0;
223     while(i1 != ie && *i1 !='\t') ++i1;
224     if(i1 != ie) {
225         string::const_iterator i2 = ++i1;
226         while(i2 != ie && *i2 !='\t') ++i2;
227         if(i2 != ie) {
228             if(find(i0, i1, '|') == i1) {
229                 const string strid = rv->GetQueryId()->GetSeqIdString(true);
230                 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
231                 rv->SetQueryId(seqid);
232             }
233             if(find(i1, i2, '|') == i2) {
234                 const string strid = rv->GetSubjId()->GetSeqIdString(true);
235                 CRef<CSeq_id> seqid (new CSeq_id(CSeq_id::e_Local, strid));
236                 rv->SetSubjId(seqid);
237             }
238             return rv;
239         }
240     }
241     const string errmsg = string("Incorrectly formatted blast hit:\n") + m8;
242     NCBI_THROW(CSplignAppException, eBadData, errmsg);
243 #else
244     return rv;
245 #endif
246 }
247 
248 
x_GetNextPair(const THitRefs & hitrefs,THitRefs * hitrefs_pair)249 bool CSplignApp::x_GetNextPair(const THitRefs& hitrefs, THitRefs* hitrefs_pair)
250 {
251     USING_SCOPE(objects);
252 
253     hitrefs_pair->resize(0);
254 
255     const size_t dim = hitrefs.size();
256     if(dim == 0) {
257         return false;
258     }
259 
260     if(m_CurHitRef == dim) {
261         m_CurHitRef = numeric_limits<size_t>::max();
262         return false;
263     }
264 
265     if(m_CurHitRef == numeric_limits<size_t>::max()) {
266         m_CurHitRef = 0;
267     }
268 
269     CConstRef<CSeq_id> query (hitrefs[m_CurHitRef]->GetQueryId());
270     CConstRef<CSeq_id> subj  (hitrefs[m_CurHitRef]->GetSubjId());
271     while(m_CurHitRef < dim
272           && hitrefs[m_CurHitRef]->GetQueryId()->Match(*query)
273            && hitrefs[m_CurHitRef]->GetSubjId()->Match(*subj)  )
274     {
275         hitrefs_pair->push_back(hitrefs[m_CurHitRef++]);
276     }
277     return true;
278 }
279 
280 
x_GetNextPair(istream & ifs,THitRefs * hitrefs)281 bool CSplignApp::x_GetNextPair(istream& ifs, THitRefs* hitrefs)
282 {
283     hitrefs->resize(0);
284 
285     if(!m_PendingHits.size() && !ifs ) {
286         return false;
287     }
288 
289     if(!m_PendingHits.size()) {
290 
291         THit::TId query, subj;
292 
293         if(m_firstline.size()) {
294 
295             THitRef hitref (s_ReadBlastHit(m_firstline));
296             query = hitref->GetQueryId();
297             subj  = hitref->GetSubjId();
298             m_PendingHits.push_back(hitref);
299         }
300 
301         char buf [1024];
302         while(ifs) {
303 
304             buf[0] = 0;
305             CT_POS_TYPE pos0 = ifs.tellg();
306             ifs.getline(buf, sizeof buf, '\n');
307             CT_POS_TYPE pos1 = ifs.tellg();
308             if(pos1 == pos0) break; // GCC hack
309             if(buf[0] == '#') continue; // skip comments
310             const char* p = buf; // skip leading spaces
311             while(*p == ' ' || *p == '\t') ++p;
312             if(*p == 0) continue; // skip empty lines
313 
314             THitRef hit (s_ReadBlastHit(p));
315             if(query.IsNull()) {
316                 query = hit->GetQueryId();
317             }
318             if(subj.IsNull()) {
319                 subj = hit->GetSubjId();
320             }
321             if(hit->GetQueryStrand() == false) {
322                 hit->FlipStrands();
323             }
324             if(hit->GetSubjStop() == hit->GetSubjStart()) {
325                 // skip single bases
326                 continue;
327             }
328 
329             if(hit->GetQueryId()->Match(*query) == false ||
330                hit->GetSubjId()->Match(*subj) == false) {
331 
332                 m_firstline = p;
333                 break;
334             }
335 
336             m_PendingHits.push_back(hit);
337         }
338     }
339 
340     const size_t pending_size = m_PendingHits.size();
341     if(pending_size) {
342 
343         THit::TId query = m_PendingHits[0]->GetQueryId();
344         THit::TId subj  = m_PendingHits[0]->GetSubjId();
345         size_t i = 1;
346         for(; i < pending_size; ++i) {
347 
348             THitRef h = m_PendingHits[i];
349             if(h->GetQueryId()->Match(*query) == false ||
350                h->GetSubjId()->Match(*subj) == false) {
351                 break;
352             }
353         }
354         hitrefs->resize(i);
355         copy(m_PendingHits.begin(), m_PendingHits.begin() + i,
356              hitrefs->begin());
357         m_PendingHits.erase(m_PendingHits.begin(), m_PendingHits.begin() + i);
358     }
359 
360     return hitrefs->size() > 0;
361 }
362 
363 
ReadCompartment(istream & istr,CSplign::THitRefs * phitrefs)364 void ReadCompartment(istream& istr, CSplign::THitRefs* phitrefs)
365 {
366     phitrefs->clear();
367     while(istr) {
368         string line;
369         getline(istr, line);
370         if(line.empty()) {
371             if(phitrefs->empty()) continue; else break;
372         }
373         CSplign::THitRef h (new CSplign::THit(line.c_str()));
374         phitrefs->push_back(h);
375     }
376 }
377 
378 
x_GetNextComp(istream & ifs,THitRefs * phitrefs,THit::TCoord * psubj_min,THit::TCoord * psubj_max)379 bool CSplignApp::x_GetNextComp(istream& ifs,
380                                THitRefs* phitrefs,
381                                THit::TCoord* psubj_min,
382                                THit::TCoord* psubj_max)
383 {
384     static THitRefs hitrefs_next;
385     THitRefs & hitrefs (*phitrefs);
386 
387     const THit::TCoord kUndef (numeric_limits<THit::TCoord>::max());
388     const THit::TCoord kMax (numeric_limits<THit::TCoord>::max() - 1);
389     static THit::TCoord smin (kUndef), smax (kUndef);
390 
391     if(!hitrefs_next.empty()) {
392         hitrefs.resize(hitrefs_next.size());
393         copy(hitrefs_next.begin(), hitrefs_next.end(), hitrefs.begin());
394         hitrefs_next.clear();
395     }
396     else {
397         // read the first compartment
398         ReadCompartment(ifs, phitrefs);
399     }
400 
401     // read the next compartment
402     ReadCompartment(ifs, &hitrefs_next);
403 
404     // init coord range - may clarify further
405     if(smin != kUndef) {
406         *psubj_min = smin;
407         *psubj_max = kMax;
408     }
409     else if(smax != kUndef) {
410         *psubj_min = 0;
411         *psubj_max = smax;
412     }
413     else {
414         *psubj_min = 0;
415         *psubj_max = kMax;
416     }
417 
418     if(!hitrefs_next.empty()
419        && hitrefs.front()->GetSubjStrand() == hitrefs_next.front()->GetSubjStrand()
420        && hitrefs.front()->GetQueryId()->Match(*(hitrefs_next.front()->GetQueryId()))
421        && hitrefs.front()->GetSubjId()->Match(*(hitrefs_next.front()->GetSubjId())))
422     {
423         {{
424           //check if compartments intersect
425                 THit::TCoord min1 = min(hitrefs.front()->GetSubjMin(),
426                        hitrefs.back()->GetSubjMin());
427                 THit::TCoord min2 = min(hitrefs_next.front()->GetSubjMin(),
428                        hitrefs_next.back()->GetSubjMin());
429                 THit::TCoord max1 = max(hitrefs.front()->GetSubjMax(),
430                        hitrefs.back()->GetSubjMax());
431                 THit::TCoord max2 = max(hitrefs_next.front()->GetSubjMax(),
432                        hitrefs_next.back()->GetSubjMax());
433                 if( max1 > min1 && min2 < max1 ) {
434                     CNcbiOstrstream ostr;
435                     ostr << "\nTwo compartments intersect for pair "<<hitrefs.front()->GetQueryId()->AsFastaString()
436                          <<" "<<hitrefs.front()->GetSubjId()->AsFastaString()
437                          << "\n compartment ranges are"
438                          <<"\n(" << min1+1 << ", " << max1+1 << ')'
439                          <<"\n(" << min2+1 << ", " << max2+1 << ")\n";
440                     const string errmsg = CNcbiOstrstreamToString(ostr);
441                     NCBI_THROW2(CAlgoAlignException, ePattern, errmsg, eDiag_Fatal);
442                 }
443 
444         }}
445 
446         if(hitrefs.front()->GetSubjStart() < hitrefs_next.front()->GetSubjStart()) {
447             *psubj_min = smin != kUndef? smin: 0;
448             *psubj_max = min(hitrefs_next.front()->GetSubjMin(),
449                              hitrefs_next.back()->GetSubjMin());
450             smin = max(hitrefs.front()->GetSubjMax(),
451                        hitrefs.back()->GetSubjMax());
452             smax = kUndef;
453         }
454         else {
455             *psubj_min = max(hitrefs_next.front()->GetSubjMax(),
456                              hitrefs_next.back()->GetSubjMax());
457             *psubj_max = smax != kUndef? smax: kMax;
458             smin = kUndef;
459             smax = min(hitrefs.front()->GetSubjMin(),
460                        hitrefs.back()->GetSubjMin());
461         }
462     }
463     else {
464         smin = smax = kUndef;
465     }
466 
467     return !hitrefs.empty();
468 }
469 
470 
x_LogCompartmentStatus(const THit::TId & query,const THit::TId & subj,const CSplign::SAlignedCompartment & ac)471 void CSplignApp::x_LogCompartmentStatus(const THit::TId & query,
472                                         const THit::TId & subj,
473                                         const CSplign::SAlignedCompartment & ac)
474 {
475     typedef CSplign::SAlignedCompartment TCompartment;
476 
477     switch(ac.m_Status) {
478 
479         case TCompartment::eStatus_Ok: {
480 
481             if(ac.m_Id == 0) {
482                 NCBI_THROW(CSplignAppException, eInternal, "Missing compartment id.");
483             }
484 
485             *m_logstream << (ac.m_QueryStrand? '+': '-') << ac.m_Id
486                          << '\t' << query->GetSeqIdString(true)
487                          << '\t' << subj->GetSeqIdString(true)
488                          << '\t' << ac.m_Msg
489                          << '\t' << ac.m_Score
490                          << endl;
491         }
492         break;
493 
494         case TCompartment::eStatus_Error: {
495 
496             *m_logstream << '-'
497                          << '\t' << query->GetSeqIdString(true)
498                          << '\t' << subj->GetSeqIdString(true)
499                          << '\t' << ac.m_Msg
500                          << '\t' << '-'
501                          << endl;
502         }
503         break;
504 
505         case TCompartment::eStatus_Empty:
506         break;
507 
508         default: {
509             NCBI_THROW(CSplignAppException, eInternal,
510                        "Unexpected compartment status.");
511         }
512     }
513 
514 }
515 
516 
517 CRef<blast::CBlastOptionsHandle>
x_SetupBlastOptions(bool use_disc)518 CSplignApp::x_SetupBlastOptions(bool use_disc)
519 {
520     USING_SCOPE(blast);
521 
522     m_BlastProgram = use_disc? eDiscMegablast: eMegablast;
523 
524     CRef<CBlastOptionsHandle> blast_options_handle
525         (CBlastOptionsFactory::Create(m_BlastProgram));
526 
527     blast_options_handle->SetDefaults();
528 
529     CBlastOptions& blast_opt = blast_options_handle->SetOptions();
530 
531     if(!use_disc) {
532 
533         const CArgs& args = GetArgs();
534         blast_opt.SetWordSize(args["W"].AsInteger());
535         blast_opt.SetMaskAtHash(true);
536         blast_opt.SetDustFiltering(false);
537     }
538 
539     if(blast_options_handle->Validate() == false) {
540         NCBI_THROW(CSplignAppException,
541                    eInternal,
542                    "Incorrect blast setup");
543     }
544 
545     return blast_options_handle;
546 }
547 
548 
549 enum ERunMode {
550     eNotSet,
551     ePairwise, // single query vs single subj
552     eBatch1,   // use external raw blast hits
553     eBatch2    // use pre-computed compartments
554 };
555 
556 
557 
558 const char kSplignLdsDb[] = "splign.lds2db";
559 
GetLdsDbDir(const string & fasta_dir)560 string GetLdsDbDir(const string& fasta_dir)
561 {
562     string lds_db_dir = fasta_dir;
563     const char sep = CDirEntry::GetPathSeparator();
564     const size_t fds = fasta_dir.size();
565     if(fds > 0 && fasta_dir[fds-1] != sep) {
566         lds_db_dir += sep;
567     }
568     lds_db_dir += "_SplignLDS2_";
569     return lds_db_dir;
570 }
571 
572 
x_ReadFastaSetId(const CArgValue & argval,CRef<objects::CScope> scope)573 CRef<objects::CSeq_id> CSplignApp::x_ReadFastaSetId(const CArgValue& argval,
574                                                     CRef<objects::CScope> scope)
575 {
576     USING_SCOPE(objects);
577 
578     CRef<ILineReader> line_reader;
579     try {
580         line_reader.Reset(
581             new CMemoryLineReader(new CMemoryFile(argval.AsString()),
582                                   eTakeOwnership));
583     } catch (...) { // fall back to streams
584         line_reader.Reset(new CStreamLineReader(argval.AsInputFile()));
585     }
586     CFastaReader fasta_reader(* line_reader,
587                               CFastaReader::fAssumeNuc | CFastaReader::fOneSeq);
588     CConstRef<CSeq_entry> se (fasta_reader.ReadOneSeq());
589 
590     scope->AddTopLevelSeqEntry(*se);
591     const CSeq_entry::TSeq& bioseq = se->GetSeq();
592     const CSeq_entry::TSeq::TId& seqid = bioseq.GetId();
593     return seqid.back();
594 }
595 
596 
Run()597 int CSplignApp::Run()
598 {
599     USING_SCOPE(objects);
600 
601     const CArgs & args (GetArgs());
602 
603     // check that modes aren't mixed
604 
605     if( args["query"] && args["query_id"] ) {
606         NCBI_THROW(CSplignAppException,
607                    eBadParameter,
608                    "\"-query\" is not compatible with \"-query_id\". "
609                    "Specify -help to print arguments." );
610     }
611 
612     if( args["subj"] && args["subj_id"] ) {
613         NCBI_THROW(CSplignAppException,
614                    eBadParameter,
615                    "\"-subj\" is not compatible with \"-subj_id\". "
616                    "Specify -help to print arguments." );
617     }
618 
619     const bool is_mklds   = args["mklds"];
620 
621     const bool is_hits    = args["hits"];
622     const bool is_query   = args["query"] || args["query_id"];
623     const bool is_subj    = args["subj"] || args["subj_id"];
624     const bool is_comps   = args["comps"];
625 
626     const bool use_disc_megablast (args["disc"]);
627 
628     if (is_mklds) {
629         // create LDS DB and exit
630         string fa_dir = args["mklds"].AsString();
631         if(CDirEntry::IsAbsolutePath(fa_dir) == false) {
632             string curdir = CDir::GetCwd();
633             const char sep = CDirEntry::GetPathSeparator();
634             const size_t curdirsize = curdir.size();
635             if(curdirsize && curdir[curdirsize-1] != sep) {
636                 curdir += sep;
637             }
638             fa_dir = curdir + fa_dir;
639         }
640 
641         const string lds_db_dir (GetLdsDbDir(fa_dir));
642         CDir dir(lds_db_dir);
643         dir.Create();
644         string db_file = CDirEntry::ConcatPath(lds_db_dir, kSplignLdsDb);
645         CLDS2_Manager ldsmgr(db_file);
646         ldsmgr.AddDataDir(fa_dir, CLDS2_Manager::eDir_Recurse);
647         ldsmgr.UpdateData();
648 
649         return 0;
650     }
651 
652     // determine mode and verify arguments
653     ERunMode run_mode (eNotSet);
654 
655     if(is_query && is_subj && !(is_hits || is_comps)) {
656         run_mode = ePairwise;
657     }
658     else if(is_hits && !(is_comps ||is_query || is_subj)) {
659         run_mode = eBatch1;
660     }
661     else if(is_comps && !(is_hits ||is_query || is_subj)) {
662         run_mode = eBatch2;
663     }
664 
665     if(run_mode == eNotSet) {
666         NCBI_THROW(CSplignAppException,
667                    eBadParameter,
668                    "Incomplete or inconsistent set of arguments specified. "
669                    "Specify -help to print arguments." );
670     }
671 
672     // open log stream
673     m_logstream = & args["log"].AsOutputFile();
674 
675     // open asn output stream, if any
676     m_AsnOut = args["asn"]? & args["asn"].AsOutputFile(): NULL;
677 
678     // open paiwise alignment output stream, if any
679     m_AlnOut = args["aln"]? & args["aln"].AsOutputFile(): NULL;
680 
681     // in pairwise mode, setup blast options
682     if(run_mode == ePairwise) {
683         m_BlastOptionsHandle = x_SetupBlastOptions(use_disc_megablast);
684     }
685 
686     // splign and formatter setup
687     m_Splign.Reset(new CSplign);
688     CSplignArgUtil::ArgsToSplign(m_Splign, args);
689 
690     m_Splign->SetStartModelId(1);
691 
692     // splign formatter object
693     m_Formatter.Reset(new CSplignFormatter(*m_Splign));
694 
695     // add loaders
696     CRef<CScope> scope;
697     scope.Reset (new CScope(*m_ObjMgr));
698 
699 
700     {{
701         if(args["genbank"]) {
702             CGBDataLoader::RegisterInObjectManager(*m_ObjMgr);
703         }
704 
705         int priority = 1;
706 
707         typedef vector<string> TDbs;
708         TDbs dbs;
709 
710         string blastdbstr;
711         if (args["blastdb"].HasValue()) {
712             blastdbstr = args["blastdb"].AsString();
713         }
714         dbs.clear();
715         NStr::Split(blastdbstr, ",", dbs);
716         ITERATE(TDbs, i, dbs) {
717             string db = NStr::TruncateSpaces(*i);
718             if (db.empty()) {
719                 continue;
720             }
721             CBlastDbDataLoader::EDbType dbtype(CBlastDbDataLoader::eUnknown);
722             if (NStr::StartsWith(db, "na:", NStr::eNocase)) {
723                 db.erase(0, 3);
724                 dbtype = CBlastDbDataLoader::eNucleotide;
725             } else if (NStr::StartsWith(db, "aa:", NStr::eNocase)) {
726                 db.erase(0, 3);
727                 dbtype = CBlastDbDataLoader::eProtein;
728             }
729             CBlastDbDataLoader::RegisterInObjectManager(*m_ObjMgr,db, dbtype, true,CObjectManager::eDefault,priority);
730             LOG_POST(Info << "added loader: BlastDB: "
731                      << (dbtype == CBlastDbDataLoader::eNucleotide ?
732                      "nucl" : "protein")
733                      << ": " << db
734                      << " (" << priority << ")");
735             ++priority;
736         }
737 
738         string ldsdbstr;
739         if (args["ldsdir"].HasValue()) {
740             ldsdbstr = args["ldsdir"].AsString();
741         }
742         dbs.clear();
743         NStr::Split(ldsdbstr, ",", dbs);
744         ITERATE(TDbs, i, dbs) {
745             const string fasta_dir = *i;
746             string db = GetLdsDbDir(fasta_dir);
747             db = NStr::TruncateSpaces(db);
748 
749             if (db.empty()) {
750                 continue;
751             }
752             db = CDirEntry::CreateAbsolutePath(db);
753             db = CDirEntry::NormalizePath(db);
754 
755             string db_file = CDirEntry::ConcatPath(db, kSplignLdsDb);
756             CLDS2_DataLoader::RegisterInObjectManager(*m_ObjMgr, db_file, -1, CObjectManager::eDefault, priority);
757             LOG_POST(Info << "added loader: LDS: " << db << " (" << priority << ")");
758             ++priority;
759         }
760     }}
761 
762     scope->AddDefaults();
763     m_Splign->SetScope() = scope;
764 
765     // run splign in selected mode
766     if(run_mode == ePairwise) {
767         //query
768         CRef<CSeq_id> seqid_query;
769         if(args["query"]) {
770             seqid_query = x_ReadFastaSetId(args["query"], scope);
771         } else if(args["query_id"]) {
772             const string& id_list = args["query_id"].AsString();
773             try {
774                 CBioseq::TId ids;
775                 CSeq_id::ParseFastaIds(ids, id_list);
776                 seqid_query = FindBestChoice(ids, objects::CSeq_id::Score);
777                 if(!seqid_query) seqid_query = new CSeq_id(CSeq_id::e_Local, id_list);
778 
779             } catch(CException) {
780                 seqid_query.Reset (new CSeq_id(CSeq_id::e_Local, id_list));
781             }
782         } else {
783             NCBI_THROW(CSplignAppException,
784                        eBadParameter,
785                        "Query is missing. "
786                        "Specify -help to print arguments." );
787         }
788 
789         //subject
790         CRef<CSeq_id> seqid_subj;
791         if(args["subj"]) {
792             seqid_subj = x_ReadFastaSetId(args["subj"], scope);
793         } else if(args["subj_id"]) {
794             const string& id_list = args["subj_id"].AsString();
795             try {
796                 CBioseq::TId ids;
797                 CSeq_id::ParseFastaIds(ids, id_list);
798                 seqid_subj = FindBestChoice(ids, objects::CSeq_id::Score);
799                 if(!seqid_subj) seqid_subj = new CSeq_id(CSeq_id::e_Local, id_list);
800 
801             } catch(CException) {
802                 seqid_subj.Reset (new CSeq_id(CSeq_id::e_Local, id_list));
803             }
804         } else {
805             NCBI_THROW(CSplignAppException,
806                        eBadParameter,
807                        "Subject is missing. "
808                        "Specify -help to print arguments." );
809         }
810 
811         THitRefs hitrefs;
812         x_GetBl2SeqHits(seqid_query, seqid_subj, scope, &hitrefs);
813         x_ProcessPair(hitrefs, args);
814     }
815     else if (run_mode == eBatch1) {
816 
817         THitRefs hitrefs;
818         CNcbiIstream& hit_stream = args["hits"].AsInputFile();
819         while(x_GetNextPair(hit_stream, &hitrefs) ) {
820             x_ProcessPair(hitrefs, args);
821         }
822     }
823     else if (run_mode == eBatch2) {
824 
825         CNcbiIstream& hit_stream (args["comps"].AsInputFile());
826         THitRefs hitrefs;
827         THit::TCoord subj_min, subj_max;
828 
829         while(x_GetNextComp(hit_stream, &hitrefs, &subj_min, &subj_max) ) {
830 
831             if(hitrefs.front()->GetScore() > 0) {
832                 x_ProcessPair(hitrefs, args, subj_min, subj_max);
833             }
834         }
835     }
836     else {
837         NCBI_THROW(CSplignAppException,
838                    eInternal,
839                    "Mode not implemented");
840     }
841 
842     cout << "# END" << endl;
843 
844     return 0;
845 }
846 
847 
x_GetBl2SeqHits(CRef<objects::CSeq_id> seqid_query,CRef<objects::CSeq_id> seqid_subj,CRef<objects::CScope> scope,THitRefs * phitrefs)848 void CSplignApp::x_GetBl2SeqHits(
849     CRef<objects::CSeq_id> seqid_query,
850     CRef<objects::CSeq_id> seqid_subj,
851     CRef<objects::CScope>  scope,
852     THitRefs* phitrefs)
853 {
854     USING_SCOPE(blast);
855     USING_SCOPE(objects);
856 
857     phitrefs->resize(0);
858     phitrefs->reserve(100);
859 
860     CRef<CSeq_loc> seqloc_query (new CSeq_loc);
861     seqloc_query->SetWhole().Assign(*seqid_query);
862     CRef<CSeq_loc> seqloc_subj (new CSeq_loc);
863     seqloc_subj->SetWhole().Assign(*seqid_subj);
864 
865     CBl2Seq Blast( SSeqLoc(seqloc_query.GetNonNullPointer(),
866                            scope.GetNonNullPointer()),
867                    SSeqLoc(seqloc_subj.GetNonNullPointer(),
868                            scope.GetNonNullPointer()),
869                    m_BlastProgram);
870 
871     Blast.SetOptionsHandle() = *m_BlastOptionsHandle;
872 
873     TSeqAlignVector blast_output (Blast.Run());
874 
875     ITERATE(TSeqAlignVector, ii, blast_output) {
876         if((*ii)->IsSet()) {
877             const CSeq_align_set::Tdata &sas0 = (*ii)->Get();
878             ITERATE(CSeq_align_set::Tdata, sa_iter, sas0) {
879                     CRef<CBlastTabular> hitref (new CBlastTabular(**sa_iter));
880                     if(hitref->GetQueryStrand() == false) {
881                         hitref->FlipStrands();
882                     }
883                     phitrefs->push_back(hitref);
884             }
885         }
886     }
887 }
888 
x_RunSplign(bool raw_hits,THitRefs * phitrefs,THit::TCoord smin,THit::TCoord smax,CSplign::TResults * psplign_results)889 void CSplignApp::x_RunSplign(bool raw_hits, THitRefs* phitrefs,
890                              THit::TCoord smin, THit::TCoord smax,
891                              CSplign::TResults * psplign_results)
892 {
893     if(raw_hits) {
894         m_Splign->Run(phitrefs);
895         const CSplign::TResults& results (m_Splign->GetResult());
896         copy(results.begin(), results.end(), back_inserter(*psplign_results));
897     }
898     else {
899         CSplign::SAlignedCompartment ac;
900         m_Splign->AlignSingleCompartment(phitrefs, smin, smax, &ac);
901         psplign_results->push_back(ac);
902     }
903 }
904 
905 
906 // get the number of non-consensus splices in a compartment
907 // with the highest match count
GetNonConsensusSpliceCount(const CSplign::TResults & splign_results)908 size_t GetNonConsensusSpliceCount(const CSplign::TResults & splign_results)
909 {
910     size_t top_matches (0);
911     size_t rv (0);
912     ITERATE(CSplign::TResults, ii, splign_results) {
913 
914         const CSplign::SAlignedCompartment & ac (*ii);
915         size_t matches (0), nc_count(0);
916         typedef CSplign::TSegments::const_iterator TIterator;
917         char dnr [] = {0, 0, 0};
918         char acc [] = {0, 0, 0};
919         size_t exon_count (0);
920 
921         for(TIterator jjb (ac.m_Segments.begin()), jje (ac.m_Segments.end()), jj(jjb);
922             jj != jje; ++jj)
923         {
924 
925             if(jj->m_exon) {
926 
927                 const char * p (jj->m_details.data()), * pe (p +jj->m_details.size());
928                 int n (-1);
929                 for(; p != pe; ++p) {
930                     if(*p == 'M') {
931                         if(n == 0) ++matches; else if(n > 0) matches += n;
932                         n = 0;
933                     }
934                     else if(isdigit(*p) && n >= 0) {
935                         n = n * 10 + *p - '0';
936                     }
937                     else {
938                         if(n == 0) {
939                             ++matches;
940                         }
941                         n = -1;
942                     }
943                 }
944                 if(n == 0) ++matches; else if(n > 0) matches += n;
945 
946                 if(exon_count > 0) {
947 
948                     if(jj->m_annot[2] == '<') {
949 
950                         acc[0] = jj->m_annot[0];
951                         acc[1] = jj->m_annot[1];
952 
953                         if(!CNWFormatter::SSegment::s_IsConsensusSplice(dnr, acc)) {
954                             ++nc_count;
955                         }
956                     }
957                     acc[0] = acc[1] = 0;
958                 }
959 
960                 p = jj->m_annot.data();
961                 while(*p++ != '>');
962                 dnr[0] = *p++;
963                 dnr[1] = *p;
964 
965                 ++exon_count;
966             }
967         }
968 
969         if(matches > top_matches) {
970             rv = nc_count;
971             top_matches = matches;
972         }
973     }
974 
975     return rv;
976 }
977 
978 
979 struct SComplement
980 {
operator ()SComplement981     char operator() (char c) {
982         switch(c) {
983         case 'A': return 'T';
984         case 'G': return 'C';
985         case 'T': return 'A';
986         case 'C': return 'G';
987         }
988         return c;
989     }
990 };
991 
992 
x_ProcessPair(THitRefs & hitrefs,const CArgs & args,THit::TCoord smin,THit::TCoord smax)993 void CSplignApp::x_ProcessPair(THitRefs& hitrefs, const CArgs& args,
994                                THit::TCoord smin, THit::TCoord smax)
995 {
996 
997     const int flags (CSplignFormatter::eTF_NoExonScores | CSplignFormatter::eTF_UseFastaStyleIds);
998 
999     const bool raw_hits (!args["comps"]);
1000 
1001     if(hitrefs.size() == 0) {
1002         return;
1003     }
1004 
1005     // skip void compartments but obey their bounds
1006     if(hitrefs.front()->GetScore() < 0) {
1007         return;
1008     }
1009 
1010     // MASK TEST
1011     {{
1012         CSeq_id_Handle qidh = CSeq_id_Handle::GetHandle(*(hitrefs.front()->GetQueryId()));
1013         CSplign::TSeqRangeColl MaskRanges;
1014         if(args["mask_ranges"].HasValue()) {
1015             try {
1016                 string mask_ranges_arg = args["mask_ranges"].AsString();
1017                 list<string> range_strs;
1018                 NStr::Split(mask_ranges_arg, ",", range_strs);
1019                 ITERATE(list<string>, range_iter, range_strs) {
1020                     list<string> bound_strs;
1021                     NStr::Split(*range_iter, "-", bound_strs);
1022                     TSeqRange Curr(NStr::StringToNumeric<TSeqPos>(bound_strs.front())-1,
1023                                    NStr::StringToNumeric<TSeqPos>(bound_strs.back())-1);
1024                     MaskRanges += Curr;
1025                 }
1026             } catch(CException& e) {
1027                 ERR_POST("Masking Ranges parse error: " + e.ReportAll());
1028             }
1029         }
1030         if(qidh && !MaskRanges.empty()) {
1031             m_Splign->SetHardMaskRanges(qidh, MaskRanges);
1032         }
1033     }}
1034 
1035     THit::TId query (hitrefs.front()->GetQueryId());
1036     THit::TId subj  (hitrefs.front()->GetSubjId());
1037 
1038     m_Formatter->SetSeqIds(query, subj);
1039 
1040     string strand (args["direction"].AsString());
1041 
1042     if(strand == kDirDefault) {
1043         strand = (args["type"].AsString() == kQueryType_mRNA)? kDirAuto: kDirBoth;
1044     }
1045 
1046     CSplign::TResults splign_results;
1047 
1048     if(strand == kDirSense) {
1049 
1050         m_Splign->SetStrand(true);
1051         x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1052     }
1053     else if(strand == kDirAntisense) {
1054 
1055         m_Splign->SetStrand(false);
1056         x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1057     }
1058     else if(strand == kDirBoth) {
1059 
1060         // save original hits
1061         THitRefs hits0;
1062         ITERATE(THitRefs, ii, hitrefs) {
1063             const THitRef & h0 (*ii);
1064             THitRef h1 (new THit (*h0));
1065             hits0.push_back(h1);
1066         }
1067 
1068         static size_t mid (1);
1069         size_t mid_plus, mid_minus;
1070         {{
1071             m_Splign->SetStrand(true);
1072             m_Splign->SetStartModelId(mid);
1073             x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1074             mid_plus = m_Splign->GetNextModelId();
1075         }}
1076         {{
1077             m_Splign->SetStrand(false);
1078             m_Splign->SetStartModelId(mid);
1079             x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
1080             mid_minus = m_Splign->GetNextModelId();
1081         }}
1082         mid = max(mid_plus, mid_minus);
1083     }
1084     else {
1085 
1086         // 'auto' means to align both directions when in doubt
1087 
1088         THitRefs hits0;
1089         ITERATE(THitRefs, ii, hitrefs) {
1090             const THitRef & h0 (*ii);
1091             THitRef h1 (new THit (*h0));
1092             hits0.push_back(h1);
1093         }
1094 
1095         // determine the direction with the longest ORF
1096         const CSplign::TOrfPair orfs (m_Splign->GetCds(hitrefs.front()->GetQueryId()));
1097         const size_t orf_sense (orfs.first.second - orfs.first.first);
1098         const size_t orf_antisense (orfs.second.first - orfs.second.second);
1099         const bool sense_first (orf_sense >= orf_antisense);
1100 
1101         static size_t mid (1);
1102         size_t mid_first, mid_second;
1103 
1104         // align in the longest ORF direction
1105         m_Splign->SetStrand(sense_first);
1106         m_Splign->SetStartModelId(mid);
1107         x_RunSplign(raw_hits, &hitrefs, smin, smax, &splign_results);
1108         mid_first = m_Splign->GetNextModelId();
1109 
1110         // if there is a non-consensus splice, also align in the opposite direction
1111         const size_t nc_count (GetNonConsensusSpliceCount(splign_results));
1112 
1113         // same if there is a poly-a in the opposite direction
1114         bool polya_found (false);
1115         if(nc_count == 0) {
1116             CRef<CScope> scope (m_Splign->GetScope());
1117             CConstRef<CSeq_id> seqid_query (hits0.front()->GetQueryId());
1118             CBioseq_Handle bh (scope->GetBioseqHandle(*seqid_query));
1119                                CSeqVector sv (bh.GetSeqVector(CBioseq_Handle
1120                                                               ::eCoding_Iupac));
1121             string str;
1122             sv.GetSeqData(0, sv.size(), str);
1123             if(sense_first) {
1124                 reverse (str.begin(), str.end());
1125                 transform(str.begin(), str.end(), str.begin(), SComplement());
1126             }
1127             const size_t polya (CSplign::s_TestPolyA(str.data(), str.size()));
1128             polya_found = (0 < polya && polya < str.size());
1129         }
1130 
1131         if(nc_count > 0 || polya_found) {
1132             m_Splign->SetStrand(!sense_first);
1133             m_Splign->SetStartModelId(mid);
1134             x_RunSplign(raw_hits, &hits0, smin, smax, &splign_results);
1135             mid_second = m_Splign->GetNextModelId();
1136             mid = max(mid_first, mid_second);
1137         }
1138         else {
1139             mid = mid_first;
1140         }
1141     }
1142 
1143     cout << m_Formatter->AsExonTable(&splign_results, flags);
1144 
1145     if(m_AsnOut) {
1146 
1147         const CSplignFormatter::EAsnFlags flags
1148             = CSplignFormatter::EAsnFlags(CSplignFormatter::eAF_SplicedSegWithParts);
1149 
1150         CRef<CSeq_align_set> sas (m_Formatter->AsSeqAlignSet(&splign_results,
1151                                                              flags));
1152 
1153         CScoreBuilderBase score_builder;
1154         NON_CONST_ITERATE (CSeq_align_set::Tdata, align_it, sas->Set()) {
1155             score_builder.AddScore(*m_Splign->GetScope(), **align_it,
1156                                    CSeq_align::eScore_ConsensusSplices);
1157             score_builder.AddScore(*m_Splign->GetScope(), **align_it,
1158                                    CSeq_align::eScore_Splices);
1159         }
1160 
1161 
1162         *m_AsnOut << MSerial_AsnText  << *sas << endl;
1163     }
1164 
1165     if(m_AlnOut) {
1166         *m_AlnOut << m_Formatter->AsAlignmentText(m_Splign->GetScope(),
1167                                                   &splign_results);
1168     }
1169 
1170     ITERATE(CSplign::TResults, ii, splign_results) {
1171         x_LogCompartmentStatus(query, subj, *ii);
1172     }
1173 }
1174 
1175 
1176 END_NCBI_SCOPE
1177 
1178 /////////////////////////////////////
1179 
1180 USING_NCBI_SCOPE;
1181 
main(int argc,const char * argv[])1182 int main(int argc, const char* argv[])
1183 {
1184     const int rv (CSplignApp().AppMain(argc, argv));
1185     return rv;
1186 }
1187