1 /* $Id: ngalign_app.cpp 548810 2017-10-18 13:38:41Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Nathan Bouk, Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbienv.hpp>
36 #include <corelib/ncbiargs.hpp>
37
38 #include <objmgr/object_manager.hpp>
39 #include <objtools/data_loaders/genbank/gbloader.hpp>
40 #include <objmgr/scope.hpp>
41 #include <objects/general/User_object.hpp>
42 #include <objects/general/User_field.hpp>
43 #include <objects/general/Object_id.hpp>
44 #include <objects/seqalign/Seq_align.hpp>
45 #include <objects/seq/Bioseq.hpp>
46 #include <objects/seq/Annot_descr.hpp>
47 #include <objects/seq/Annotdesc.hpp>
48 #include <objects/seq/seq__.hpp>
49 #include <objtools/readers/agp_read.hpp>
50 #include <objtools/readers/agp_seq_entry.hpp>
51 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
52 #include <objtools/readers/fasta.hpp>
53
54 #include <util/line_reader.hpp>
55 #include <objects/genomecoll/genome_collection__.hpp>
56
57 #include <algo/align/ngalign/ngalign.hpp>
58 #include <algo/align/ngalign/alignment_filterer.hpp>
59 #include <algo/align/ngalign/sequence_set.hpp>
60 #include <algo/align/ngalign/blast_aligner.hpp>
61 #include <algo/align/ngalign/banded_aligner.hpp>
62 #include <algo/align/ngalign/merge_aligner.hpp>
63 //#include <algo/align/ngalign/coverage_aligner.hpp>
64 //#include <algo/align/ngalign/overlap_aligner.hpp>
65 #include <algo/align/ngalign/inversion_merge_aligner.hpp>
66 #include <algo/align/ngalign/alignment_scorer.hpp>
67 #include <algo/align/ngalign/unordered_spliter.hpp>
68
69 #include <algo/align/ngalign/merge_tree_aligner.hpp>
70
71 #include <algo/align/ngalign/ngalign_interface.hpp>
72
73 #include <objtools/data_loaders/asn_cache/Cache_blob.hpp>
74 #include <objtools/data_loaders/asn_cache/asn_index.hpp>
75 #include <objtools/data_loaders/asn_cache/chunk_file.hpp>
76 #include <objtools/data_loaders/asn_cache/asn_cache_util.hpp>
77 #include <objtools/data_loaders/asn_cache/seq_id_chunk_file.hpp>
78 #include <objtools/data_loaders/asn_cache/asn_cache_loader.hpp>
79
80
81 BEGIN_NCBI_SCOPE
82 USING_SCOPE(objects);
83
84 class CFileLoadAligner : public IAlignmentFactory
85 {
86 public:
87
CFileLoadAligner(const string & name)88 CFileLoadAligner(const string& name) : FileName(name) { ; }
89 private:
90 string FileName;
91
92 public:
93
GetName() const94 string GetName() const { return "file_load_aligner"; }
95
GenerateAlignments(objects::CScope & Scope,ISequenceSet * QuerySet,ISequenceSet * SubjectSet,TAlignResultsRef AccumResults)96 TAlignResultsRef GenerateAlignments(objects::CScope& Scope,
97 ISequenceSet* QuerySet,
98 ISequenceSet* SubjectSet,
99 TAlignResultsRef AccumResults)
100 {
101 TAlignResultsRef NewResults(new CAlignResultsSet);
102
103 CNcbiIfstream In(FileName.c_str());
104 while(In) {
105 size_t Pos = In.tellg();
106 try {
107 CRef<CSeq_align> Temp(new CSeq_align);
108 In >> MSerial_AsnText >> *Temp;
109 NewResults->Insert(Temp);
110 cerr << "FileLoad Aligner: 1 Seq-align" << endl;
111 continue;
112 } catch(...) {
113 In.seekg(Pos);
114 }
115 try {
116 CRef<CSeq_align_set> Temp(new CSeq_align_set);
117 In >> MSerial_AsnText >> *Temp;
118 NewResults->Insert(*Temp);
119 cerr << "FileLoad Aligner: " << Temp->Get().size() << " Seq-align-set" << endl;
120 continue;
121 } catch(...) {
122 In.seekg(Pos);
123 }
124 try {
125 CRef<CSeq_annot> Temp(new CSeq_annot);
126 In >> MSerial_AsnText >> *Temp;
127 CRef<CSeq_align_set> TempSet(new CSeq_align_set);
128 TempSet->Set().insert(TempSet->Set().end(), Temp->SetData().SetAlign().begin(), Temp->SetData().SetAlign().end());
129 NewResults->Insert(*TempSet);
130
131 cerr << "FileLoad Aligner: " << TempSet->Get().size() << " Seq-annot" << endl;
132 continue;
133 } catch(...) {
134 In.seekg(Pos);
135 }
136 break;
137 }
138
139 return NewResults;
140 }
141 };
142
143
144
145 class CNgAlignApp : public CNcbiApplication
146 {
147 private:
148 void Init();
149 int Run();
150
151
152 CRef<CScope> m_Scope;
153
154 auto_ptr<CUnorderedSplitter> m_Splitter;
155
156 list<CRef<CSeq_id> > m_LoadedIds;
157 list<CRef<CSeq_id> >::const_iterator LoadedIdsIter;
158
159 int BatchCounter;
160
161 bool x_CreateNgAlignRun(CNgAligner& NgAligner, IRegistry* RunRegistry);
162 CRef<ISequenceSet> x_CreateSequenceSet(IRegistry* RunRegistry,
163 const string& Category);
164
165 void x_LoadExternalSequences(IRegistry* RunRegistry,
166 const string& Category,
167 list<CRef<CSeq_id> >& LoadedIds);
168
169 void x_AddScorers(CNgAligner& NgAligner, IRegistry* RunRegistry);
170 void x_AddFilters(CNgAligner& NgAligner, IRegistry* RunRegistry);
171 void x_AddAligners(CNgAligner& NgAligner, IRegistry* RunRegistry);
172
173 CRef<CBlastAligner> x_CreateBlastAligner(IRegistry* RunRegistry, const string& Name);
174 CRef<CRemoteBlastAligner> x_CreateRemoteBlastAligner(IRegistry* RunRegistry, const string& Name);
175 CRef<CMergeAligner> x_CreateMergeAligner(IRegistry* RunRegistry, const string& Name);
176 // CRef<CCoverageAligner> x_CreateCoverageAligner(IRegistry* RunRegistry, const string& Name);
177 // CRef<CMergeTreeAligner> x_CreateMergeTreeAligner(IRegistry* RunRegistry, const string& Name);
178 CRef<CInversionMergeAligner> x_CreateInversionMergeAligner(IRegistry* RunRegistry, const string& Name);
179 CRef<CSplitSeqAlignMerger> x_CreateSplitSeqMergeAligner(IRegistry* RunRegistry, const string& Name);
180
181 CRef<CFileLoadAligner> x_CreateFileLoadAligner(IRegistry* RunRegistry, const string& Name);
182
183
184 void x_RecurseSeqEntry(CRef<CSeq_entry> Top, list<CRef<CSeq_id> >& SeqIdList);
185
186 CRef<CGC_Assembly> x_LoadAssembly(CNcbiIstream& In);
187
188 void x_PrintNoHitList(CNcbiOstream& Out, const CSeq_align_set& Alignments);
189 };
190
191
Init()192 void CNgAlignApp::Init()
193 {
194 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
195 arg_desc->SetUsageContext(GetArguments().GetProgramName(),
196 "Generic app for creating an NgAlign "
197 "run, and running it.");
198
199 arg_desc->AddFlag("info", "info level logging");
200
201 arg_desc->AddDefaultKey("o", "OutputFile",
202 "File for results",
203 CArgDescriptions::eOutputFile,
204 "-");
205 arg_desc->AddFlag("b", "Output binary ASN.1");
206
207 arg_desc->AddFlag("dup", "allow dupes");
208
209 arg_desc->AddKey("run", "InputFile", "ngalign run ini file",
210 CArgDescriptions::eInputFile);
211
212 arg_desc->AddOptionalKey("asn_cache", "string", "comma seperated paths",
213 CArgDescriptions::eString);
214 arg_desc->AddFlag("G", "No genbank");
215
216 arg_desc->AddOptionalKey("query", "string", "single seq-id",
217 CArgDescriptions::eString);
218 arg_desc->AddOptionalKey("subject", "string", "single seq-id",
219 CArgDescriptions::eString);
220 arg_desc->AddOptionalKey("qidlist", "file", "seq-id list file",
221 CArgDescriptions::eInputFile);
222 arg_desc->AddOptionalKey("sidlist", "file", "seq-id list file",
223 CArgDescriptions::eInputFile);
224 arg_desc->AddOptionalKey("idlist", "file", "fasta seq-id list file",
225 CArgDescriptions::eInputFile);
226 arg_desc->AddOptionalKey("qloclist", "file", "seq-loc list file",
227 CArgDescriptions::eInputFile);
228 arg_desc->AddOptionalKey("sloclist", "file", "seq-loc list file",
229 CArgDescriptions::eInputFile);
230 arg_desc->AddOptionalKey("blastdb", "blastdb", "Blastdb Name",
231 CArgDescriptions::eInputFile);
232 arg_desc->AddOptionalKey("softfilter", "string", "blastdb soft filter, int or string",
233 CArgDescriptions::eString);
234 arg_desc->AddOptionalKey("fasta", "file", "fasta file",
235 CArgDescriptions::eInputFile);
236
237 arg_desc->AddOptionalKey("seqentry", "file", "seq-entry file",
238 CArgDescriptions::eInputFile);
239 arg_desc->AddOptionalKey("agp", "file", "agp file",
240 CArgDescriptions::eInputFile);
241
242 arg_desc->AddOptionalKey("gc", "file", "gencoll asn.1 Assembly, for subject side. Allows for seperate ranking by assembly-unit.",
243 CArgDescriptions::eInputFile);
244
245 arg_desc->AddDefaultKey("batch", "int", "batch size for loaded ids",
246 CArgDescriptions::eInteger, "100");
247
248 arg_desc->AddOptionalKey("nohit", "outfile", "List of nohit IDs",
249 CArgDescriptions::eOutputFile);
250
251 arg_desc->AddOptionalKey("filters", "string",
252 "semi-colon seperated list of filters, overrides the ini file",
253 CArgDescriptions::eString);
254 arg_desc->AddOptionalKey("scorers", "string",
255 "semi-colon seperated list of scorers, overrides the ini file",
256 CArgDescriptions::eString);
257
258
259 SetupArgDescriptions(arg_desc.release());
260 }
261
262
Run()263 int CNgAlignApp::Run()
264 {
265 const CArgs& args = GetArgs();
266
267 if(args["info"].HasValue())
268 SetDiagPostLevel(eDiag_Info);
269
270 // Scope things
271 CRef<CObjectManager> om(CObjectManager::GetInstance());
272
273 if(!args["G"].HasValue())
274 CGBDataLoader::RegisterInObjectManager(*om);
275
276 if(args["blastdb"].HasValue()) {
277 string Orig = args["blastdb"].AsString();
278 vector<string> DBs;
279 NStr::Split(Orig, ",", DBs);
280 ITERATE(vector<string>, DBIter, DBs) {
281 cerr << *DBIter << endl;
282 CBlastDbDataLoader::RegisterInObjectManager
283 (*om, *DBIter,
284 CBlastDbDataLoader::eNucleotide,
285 true, CObjectManager::eDefault, 100);
286 }
287 }
288
289
290 if(args["asn_cache"].HasValue()) {
291 vector<string> Tokens;
292 NStr::Split(args["asn_cache"].AsString(), ",", Tokens);
293 int Priority = 1;
294 ITERATE(vector<string>, CacheIter, Tokens) {
295 CFile CacheFile( *CacheIter + "/asn_cache.idx" );
296 if(CacheFile.Exists()) {
297 CAsnCache_DataLoader::RegisterInObjectManager(*om, *CacheIter,
298 CObjectManager::eDefault, Priority);
299 Priority++;
300 }
301 }
302 }
303
304
305 m_Scope.Reset(new CScope(*om));
306 m_Scope->AddDefaults();
307
308 // end scope
309
310 m_Splitter.reset(new CUnorderedSplitter(*m_Scope));
311
312 CNcbiRegistry RunRegistry(args["run"].AsInputFile());
313
314 bool AllowDupes = false;
315 if(RunRegistry.Get("ngalign", "allow_dupes") == "true")
316 AllowDupes = true;
317 if(args["dup"].HasValue() && args["dup"].AsBoolean())
318 AllowDupes = true;
319 ERR_POST(Info << "Dupes: " << AllowDupes);
320
321 CRef<CGC_Assembly> Assembly;
322 if(args["gc"].HasValue()) {
323 Assembly = x_LoadAssembly(args["gc"].AsInputFile());
324 }
325
326
327 BatchCounter = 0;
328
329 x_LoadExternalSequences(&RunRegistry, "", m_LoadedIds);
330 LoadedIdsIter = m_LoadedIds.begin();
331
332
333 CRef<CSeq_align_set> Alignments(new CSeq_align_set);
334
335 do {
336 CNgAligner NgAligner(*m_Scope, Assembly, AllowDupes);
337
338 bool Created;
339 Created = x_CreateNgAlignRun(NgAligner, &RunRegistry);
340
341 if(!Created) {
342 cerr << "Failed to create Run." << endl;
343 return 1;
344 }
345
346 CRef<CSeq_align_set> CurrAligns;
347 try {
348 CurrAligns = NgAligner.Align();
349 } catch(CException& e) {
350 ERR_POST(Warning << e.ReportAll());
351 break;
352 }
353
354 if(CurrAligns) {
355 Alignments->Set().insert(Alignments->Set().end(),
356 CurrAligns->Get().begin(),
357 CurrAligns->Get().end());
358 }
359 //break;
360 ERR_POST(Info << "Looping" );
361 } while(!m_LoadedIds.empty() && LoadedIdsIter != m_LoadedIds.end() );
362
363 if(!Alignments.IsNull()) {
364
365 /*{{
366 CRef<CSeq_annot> Old(new CSeq_annot), New(new CSeq_annot), Blast(new CSeq_annot);
367 Blast->SetTitle("Blast Aligner");
368 Blast->AddName("Blast Aligner");
369 Old->SetTitle("Old Merge Aligner");
370 Old->AddName("Old Merge Aligner");
371 New->SetTitle("New Merge Aligner");
372 New->AddName("New Merge Aligner");
373 ITERATE(CSeq_align_set::Tdata, AlignIter, Alignments->Get()) {
374 int BlastScore=0, OldScore=0, NewScore=0;
375 (*AlignIter)->GetNamedScore("blast_aligner", BlastScore);
376 (*AlignIter)->GetNamedScore("merge_aligner", OldScore);
377 (*AlignIter)->GetNamedScore("new_merge_aligner", NewScore);
378
379 if(BlastScore)
380 Blast->SetData().SetAlign().push_back(*AlignIter);
381 else if(OldScore)
382 Old->SetData().SetAlign().push_back(*AlignIter);
383 else if(NewScore)
384 New->SetData().SetAlign().push_back(*AlignIter);
385 }
386 CNcbiOstream& ostr = args["o"].AsOutputFile();
387 ostr << MSerial_AsnText << *Old;
388 ostr << MSerial_AsnText << *New;
389 ostr << MSerial_AsnText << *Blast;
390 return 1;
391 }}*/
392
393 CNcbiOstream& ostr = args["o"].AsOutputFile();
394 if(args["b"].HasValue() && args["b"].AsBoolean())
395 ostr << MSerial_AsnBinary << *Alignments;
396 else
397 ostr << MSerial_AsnText << *Alignments;
398
399 if(args["nohit"].HasValue()) {
400 x_PrintNoHitList(args["nohit"].AsOutputFile(), *Alignments);
401 }
402 } else {
403 cerr << "No alignments found." << endl;
404 return 1;
405 }
406
407 return 0;
408 }
409
410
x_CreateNgAlignRun(CNgAligner & NgAligner,IRegistry * RunRegistry)411 bool CNgAlignApp::x_CreateNgAlignRun(CNgAligner& NgAligner, IRegistry* RunRegistry)
412 {
413 CRef<ISequenceSet> Query, Subject;
414 Query = x_CreateSequenceSet(RunRegistry, "query");
415 Subject = x_CreateSequenceSet(RunRegistry, "subject");
416
417 if(Query.IsNull() || Subject.IsNull())
418 return false;
419
420 NgAligner.SetQuery(Query);
421 NgAligner.SetSubject(Subject);
422
423 x_AddScorers(NgAligner, RunRegistry);
424 x_AddFilters(NgAligner, RunRegistry);
425
426 x_AddAligners(NgAligner, RunRegistry);
427
428 return true;
429 }
430
431
432 CRef<ISequenceSet>
x_CreateSequenceSet(IRegistry * RunRegistry,const string & Category)433 CNgAlignApp::x_CreateSequenceSet(IRegistry* RunRegistry,
434 const string& Category)
435 {
436 CRef<ISequenceSet> Result;
437 const string Type = RunRegistry->Get(Category, "type");
438
439 const string Source = RunRegistry->Get(Category, "source");
440
441 const string Mask = RunRegistry->Get(Category, "mask");
442 CSeqMasker* Masker = NULL;
443 if(!Mask.empty()) {
444 string NMer = "/panfs/pan1/gpipe/ThirdParty/WindowMasker/data/" + Mask;
445 Masker = new CSeqMasker(NMer,
446 0, 1, 1, 0, 0, 0, 0, 0, 0, false, 0, 0, 0, 0, "mean", 0, false, 0, false);
447 }
448
449 const CArgs& Args = GetArgs();
450
451
452 if(Type == "seqidlist") {
453 CRef<CSeqIdListSet> IdList(new CSeqIdListSet);
454
455 if(Args["query"].HasValue() && Source.empty() && Category == "query") {
456 const string& Id = Args["query"].AsString();
457 CRef<CSeq_id> QueryId(new CSeq_id(Id));
458 IdList->SetIdList().push_back(QueryId);
459 }
460 if(Args["subject"].HasValue() && Source.empty() && Category == "subject") {
461 const string& Id = Args["subject"].AsString();
462 CRef<CSeq_id> SubjectId(new CSeq_id(Id));
463 IdList->SetIdList().push_back(SubjectId);
464 }
465 if(Args["qidlist"].HasValue() && Source.empty() && Category == "query") {
466 CNcbiIstream& In = Args["qidlist"].AsInputFile();
467 CStreamLineReader Reader(In);
468 while(!Reader.AtEOF()) {
469 ++Reader;
470 string Line = *Reader;
471 if(!Line.empty() && Line[0] != '#') {
472 Line = Line.substr(0, Line.find(" "));
473 CRef<CSeq_id> QueryId(new CSeq_id(Line));
474 IdList->SetIdList().push_back(QueryId);
475 }
476 }
477 }
478 if(Args["sidlist"].HasValue() && Source.empty() && Category == "subject") {
479 CNcbiIstream& In = Args["sidlist"].AsInputFile();
480 CStreamLineReader Reader(In);
481 while(!Reader.AtEOF()) {
482 ++Reader;
483 string Line = *Reader;
484 if(!Line.empty() && Line[0] != '#') {
485 Line = Line.substr(0, Line.find(" "));
486 CRef<CSeq_id> SubjectId(new CSeq_id(Line));
487 IdList->SetIdList().push_back(SubjectId);
488 }
489 }
490 }
491
492 if(Args["idlist"].HasValue() && Source.empty() && m_LoadedIds.empty()) {
493 CNcbiIstream& In = Args["idlist"].AsInputFile();
494 CStreamLineReader Reader(In);
495 while(!Reader.AtEOF()) {
496 ++Reader;
497 string Line = *Reader;
498 if(!Line.empty() && Line[0] != '#') {
499 Line = Line.substr(0, Line.find(" "));
500 CRef<CSeq_id> QueryId(new CSeq_id(Line));
501 m_LoadedIds.push_back(QueryId);
502 //IdList->SetIdList().push_back(QueryId);
503 }
504 }
505 LoadedIdsIter = m_LoadedIds.begin();
506 }
507
508 if(IdList->SetIdList().empty() && !m_LoadedIds.empty()) {
509 int BatchSize = Args["batch"].AsInteger();
510 for(int Count = 0;
511 Count < BatchSize && LoadedIdsIter != m_LoadedIds.end();
512 Count++) {
513 //cerr << __LINE__ << (*LoadedIdsIter)->AsFastaString() << endl;
514 IdList->SetIdList().push_back(*LoadedIdsIter);
515 ++LoadedIdsIter;
516 }
517 }
518
519 if(Masker != NULL)
520 IdList->SetSeqMasker(Masker);
521
522 Result.Reset(IdList.GetPointer());
523 }
524 else if(Type == "seqloclist") {
525 CRef<CSeqLocListSet> LocList(new CSeqLocListSet);
526
527 if(Args["query"].HasValue() && Source.empty() && Category == "query" ) {
528 string QueryString = Args["query"].AsString();
529 //CStreamLineReader Reader(In);
530 //while(!Reader.AtEOF()) {
531 // ++Reader;
532 string Line = QueryString;
533 if(!Line.empty() && Line[0] != '#') {
534 vector<string> Tokens;
535 NStr::Split(Line, "\t -.", Tokens, NStr::fSplit_Tokenize);
536 CRef<CSeq_loc> Loc(new CSeq_loc);
537
538 Loc->SetInt().SetId().Set(Tokens[0]);
539 if(Loc->GetInt().GetId().IsGi() && Loc->GetInt().GetId().GetGi() < GI_CONST(50)) {
540 Loc->SetInt().SetId().SetLocal().SetId() = NStr::StringToInt(Tokens[0]);
541 }
542
543 Loc->SetInt().SetFrom() = NStr::StringToInt(Tokens[1])-1;
544 Loc->SetInt().SetTo() = NStr::StringToInt(Tokens[2])-1;
545 LocList->SetLocList().push_back(Loc);
546 }
547 //}
548 }
549 else if(Args["qloclist"].HasValue() && Source.empty() && Category == "query" ) {
550 CNcbiIstream& In = Args["qloclist"].AsInputFile();
551 CStreamLineReader Reader(In);
552 while(!Reader.AtEOF()) {
553 ++Reader;
554 string Line = *Reader;
555 if(!Line.empty() && Line[0] != '#') {
556 vector<string> Tokens;
557 NStr::Split(Line, "\t -.", Tokens, NStr::fSplit_Tokenize);
558 CRef<CSeq_loc> Loc(new CSeq_loc);
559
560 Loc->SetInt().SetId().Set(Tokens[0]);
561 if(Loc->GetInt().GetId().IsGi() && Loc->GetInt().GetId().GetGi() < GI_CONST(50)) {
562 Loc->SetInt().SetId().SetLocal().SetId() = NStr::StringToInt(Tokens[0]);
563 }
564
565 Loc->SetInt().SetFrom() = NStr::StringToInt(Tokens[1])-1;
566 Loc->SetInt().SetTo() = NStr::StringToInt(Tokens[2])-1;
567 LocList->SetLocList().push_back(Loc);
568 }
569 }
570 }
571
572 else if(Args["subject"].HasValue() && Source.empty() && Category == "subject" ) {
573 string SubjectString = Args["subject"].AsString();
574 //CStreamLineReader Reader(In);
575 //while(!Reader.AtEOF()) {
576 // ++Reader;
577 string Line = SubjectString;
578 if(!Line.empty() && Line[0] != '#') {
579 vector<string> Tokens;
580 NStr::Split(Line, "\t -.", Tokens, NStr::fSplit_Tokenize);
581 CRef<CSeq_loc> Loc(new CSeq_loc);
582 Loc->SetInt().SetId().Set(Tokens[0]);
583 if(Loc->GetInt().GetId().IsGi() && Loc->GetInt().GetId().GetGi() < GI_CONST(50)) {
584 Loc->SetInt().SetId().SetLocal().SetId() = NStr::StringToInt(Tokens[0]);
585 }
586
587 Loc->SetInt().SetFrom() = NStr::StringToInt(Tokens[1])-1;
588 Loc->SetInt().SetTo() = NStr::StringToInt(Tokens[2])-1;
589 LocList->SetLocList().push_back(Loc);
590 }
591 //}
592 }
593
594 else if(Args["sloclist"].HasValue() && Source.empty() && Category == "subject" ) {
595 CNcbiIstream& In = Args["sloclist"].AsInputFile();
596 CStreamLineReader Reader(In);
597 while(!Reader.AtEOF()) {
598 ++Reader;
599 string Line = *Reader;
600 if(!Line.empty() && Line[0] != '#') {
601 vector<string> Tokens;
602 NStr::Split(Line, "\t -.", Tokens, NStr::fSplit_Tokenize);
603 CRef<CSeq_loc> Loc(new CSeq_loc);
604 Loc->SetInt().SetId().Set(Tokens[0]);
605 if(Loc->GetInt().GetId().IsGi() && Loc->GetInt().GetId().GetGi() < GI_CONST(50)) {
606 Loc->SetInt().SetId().SetLocal().SetId() = NStr::StringToInt(Tokens[0]);
607 }
608
609 Loc->SetInt().SetFrom() = NStr::StringToInt(Tokens[1])-1;
610 Loc->SetInt().SetTo() = NStr::StringToInt(Tokens[2])-1;
611 LocList->SetLocList().push_back(Loc);
612 }
613 }
614 }
615
616 if(Masker != NULL)
617 LocList->SetSeqMasker(Masker);
618
619 Result.Reset(LocList.GetPointer());
620 }
621 else if(Type == "blastdb") {
622 string Orig = Args["blastdb"].AsString();
623 vector<string> DBs;
624 NStr::Split(Orig, ",", DBs);
625 ITERATE(vector<string>, DBIter, DBs) {
626 CRef<CBlastDbSet> BlastDb(new CBlastDbSet(*DBIter));
627 if(Args["softfilter"].HasValue()) {
628 BlastDb->SetSoftFiltering(Args["softfilter"].AsString());
629 }
630 Result.Reset(BlastDb.GetPointer());
631 break;
632 }
633 }
634 else if(Type == "fasta") {
635 string FileName;
636 FileName = RunRegistry->Get(Category, "fasta");
637 if(Args["fasta"].HasValue())
638 FileName = Args["fasta"].AsString();
639 int Batch = -1;
640 if(Args["batch"].HasValue())
641 Batch = Args["batch"].AsInteger();
642 ERR_POST(Info << "Fasta File Name: " << FileName );
643 // yes, its being leaked.
644 CNcbiIfstream* FastaIn = new CNcbiIfstream(FileName.c_str());
645 CRef<CFastaFileSet> FastaSet;
646 if(Batch != -1) {
647 FastaSet.Reset(new CFastaFileSet(FastaIn, BatchCounter, Batch));
648 BatchCounter += Batch;
649 for(int i = 0; i < Batch; i++) {
650 if(LoadedIdsIter != m_LoadedIds.end())
651 ++LoadedIdsIter;
652 }
653 } else {
654 FastaSet.Reset(new CFastaFileSet(FastaIn));
655 }
656 Result.Reset(FastaSet.GetPointer());
657 }
658 else if(Type == "splitseqidlist") {
659 CRef<CSplitSeqIdListSet> IdList(new CSplitSeqIdListSet(m_Splitter.get()));
660
661 if(Args["query"].HasValue()) {
662 const string& Id = Args["query"].AsString();
663 CRef<CSeq_id> QueryId(new CSeq_id(Id));
664 IdList->AddSeqId(QueryId);
665 }
666
667 /*if(Args["idlist"].HasValue()) {
668 CNcbiIstream& In = Args["idlist"].AsInputFile();
669 CStreamLineReader Reader(In);
670 while(!Reader.AtEOF()) {
671 ++Reader;
672 string Line = *Reader;
673 if(!Line.empty() && Line[0] != '#') {
674 Line = Line.substr(0, Line.find(" "));
675 CRef<CSeq_id> QueryId(new CSeq_id(Line));
676 IdList->AddSeqId(QueryId);
677 }
678 }
679 }*/
680
681 if(Args["idlist"].HasValue() && Source.empty() && m_LoadedIds.empty()) {
682 CNcbiIstream& In = Args["idlist"].AsInputFile();
683 CStreamLineReader Reader(In);
684 while(!Reader.AtEOF()) {
685 ++Reader;
686 string Line = *Reader;
687 if(!Line.empty() && Line[0] != '#') {
688 Line = Line.substr(0, Line.find(" "));
689 CRef<CSeq_id> QueryId(new CSeq_id(Line));
690 m_LoadedIds.push_back(QueryId);
691 //IdList->SetIdList().push_back(QueryId);
692 }
693 }
694 LoadedIdsIter = m_LoadedIds.begin();
695 }
696
697 cerr << __LINE__ << endl;
698 if(IdList->Empty() && !m_LoadedIds.empty()) {
699 int BatchSize = Args["batch"].AsInteger();
700 for(int Count = 0;
701 Count < BatchSize && LoadedIdsIter != m_LoadedIds.end();
702 Count++) {
703 cerr << __LINE__ << (*LoadedIdsIter)->AsFastaString() << endl;
704 IdList->AddSeqId(*LoadedIdsIter);
705 //IdList->SetIdList().push_back(*LoadedIdsIter);
706 ++LoadedIdsIter;
707 }
708 }
709
710
711 if(IdList->Empty()) {
712 ERR_POST(Error << " Split Seq Id List is empty, maybe all gap? ");
713 }
714
715 if(Masker != NULL)
716 IdList->SetSeqMasker(Masker);
717
718 Result.Reset(IdList.GetPointer());
719 }
720 else if(Type == "splitseqloclist") {
721 CRef<CSplitSeqLocListSet> LocList(new CSplitSeqLocListSet(m_Splitter.get()));
722
723 if(Args["qloclist"].HasValue() && Source.empty() && Category == "query" ) {
724 CNcbiIstream& In = Args["qloclist"].AsInputFile();
725 CStreamLineReader Reader(In);
726 while(!Reader.AtEOF()) {
727 ++Reader;
728 string Line = *Reader;
729 if(!Line.empty() && Line[0] != '#') {
730 vector<string> Tokens;
731 NStr::Split(Line, "\t ", Tokens, NStr::fSplit_Tokenize);
732 CRef<CSeq_loc> Loc(new CSeq_loc);
733 if(Tokens.size() >= 3) {
734 Loc->SetInt().SetId().Set(Tokens[0]);
735 Loc->SetInt().SetFrom() = NStr::StringToInt(Tokens[1])-1;
736 Loc->SetInt().SetTo() = NStr::StringToInt(Tokens[2])-1;
737 } else {
738 Loc->SetWhole().Set(Tokens[0]);
739 }
740 LocList->AddSeqLoc(Loc);
741 }
742 }
743 }
744
745 if(Args["sloclist"].HasValue() && Source.empty() && Category == "subject" ) {
746 CNcbiIstream& In = Args["sloclist"].AsInputFile();
747 CStreamLineReader Reader(In);
748 while(!Reader.AtEOF()) {
749 ++Reader;
750 string Line = *Reader;
751 if(!Line.empty() && Line[0] != '#') {
752 vector<string> Tokens;
753 NStr::Split(Line, "\t ", Tokens, NStr::fSplit_Tokenize);
754 CRef<CSeq_loc> Loc(new CSeq_loc);
755 if(Tokens.size() >= 3) {
756 Loc->SetInt().SetId().Set(Tokens[0]);
757 Loc->SetInt().SetFrom() = NStr::StringToInt(Tokens[1])-1;
758 Loc->SetInt().SetTo() = NStr::StringToInt(Tokens[2])-1;
759 } else {
760 Loc->SetWhole().Set(Tokens[0]);
761 }
762 LocList->AddSeqLoc(Loc);
763 }
764 }
765 }
766 if(LocList->Empty()) {
767 ERR_POST(Error << " Split Seq Loc List is empty, maybe all gap? ");
768 }
769
770 if(Masker != NULL)
771 LocList->SetSeqMasker(Masker);
772
773 Result.Reset(LocList.GetPointer());
774 }
775
776 return Result;
777 }
778
779
x_LoadExternalSequences(IRegistry * RunRegistry,const string & Category,list<CRef<CSeq_id>> & LoadedIds)780 void CNgAlignApp::x_LoadExternalSequences(IRegistry* RunRegistry,
781 const string& Category,
782 list<CRef<CSeq_id> >& LoadedIds)
783 {
784 const CArgs& Args = GetArgs();
785
786 if(Args["fasta"].HasValue()) {
787 CNcbiIstream& In = Args["fasta"].AsInputFile();
788 CFastaReader FastaReader(In);
789
790 while(!FastaReader.AtEOF()) {
791 CRef<CSeq_entry> Entry;
792 try{
793 Entry = FastaReader.ReadOneSeq();
794 } catch(...) {
795 break;
796 }
797 m_Scope->AddTopLevelSeqEntry(*Entry);
798 LoadedIds.push_back( Entry->GetSeq().GetId().front() );
799 }
800 cerr << __LINE__ << "\t" << LoadedIds.size() << endl;
801 }
802
803
804 if(Args["seqentry"].HasValue()) {
805 CNcbiIstream& In = Args["seqentry"].AsInputFile();
806
807 while(!In.eof()) {
808 CRef<CSeq_entry> Top(new CSeq_entry);
809
810 size_t Pos = In.tellg();
811 try {
812 In >> MSerial_AsnText >> *Top;
813 } catch(...) {
814 if(In.eof())
815 continue;
816 In.clear();
817 In.seekg(Pos);
818 try {
819 In >> MSerial_AsnBinary >> *Top;
820 } catch(...) {
821 if(!In.eof())
822 cerr << "Read Failure" << endl;
823 }
824 }
825 if(In.eof())
826 continue;
827
828 x_RecurseSeqEntry(Top, LoadedIds);
829 }
830 }
831
832 if(Args["agp"].HasValue()) {
833 CNcbiIstream& In = Args["agp"].AsInputFile();
834
835 vector< CRef< CSeq_entry > > SeqEntries;
836 try {
837 CAgpToSeqEntry Reader(CAgpToSeqEntry::fSetSeqGap );
838 Reader.SetVersion( eAgpVersion_1_1);
839 Reader.ReadStream(In);
840 SeqEntries.clear();
841 SeqEntries = Reader.GetResult();
842 //AgpRead(In, SeqEntries, eAgpRead_ParseId, true);
843 } catch(CException& e) {
844 cerr << "AgpRead Exception: " << e.ReportAll() << endl;
845 }
846
847 ITERATE(vector<CRef<CSeq_entry> >, SeqEntryIter, SeqEntries) {
848 //cerr << "Loading AGP: " << (*SeqEntryIter)->GetSeq().GetId().front()->AsFastaString() << endl;
849 m_Scope->AddTopLevelSeqEntry(**SeqEntryIter);
850 LoadedIds.push_back( (*SeqEntryIter)->GetSeq().GetId().front() );
851 }
852 }
853
854 }
855
856
x_RecurseSeqEntry(CRef<CSeq_entry> Top,list<CRef<CSeq_id>> & SeqIdList)857 void CNgAlignApp::x_RecurseSeqEntry(CRef<CSeq_entry> Top,
858 list<CRef<CSeq_id> >& SeqIdList)
859 {
860 if(Top->IsSet()) {
861 int Loop = 0;
862 ITERATE(CBioseq_set::TSeq_set, SeqIter, Top->GetSet().GetSeq_set()) {
863 //if(Loop > 1927)
864 // break;
865 //if(Loop > 1900)
866 x_RecurseSeqEntry(*SeqIter, SeqIdList);
867 Loop++;
868 }
869 } else if(Top->IsSeq()) {
870 CRef<CSeq_id> Id;
871 Id = Top->GetSeq().GetId().front();
872 SeqIdList.push_back(Id);
873 try {
874 m_Scope->AddTopLevelSeqEntry(*Top);
875 } catch(...) { ; }
876 }
877 }
878
879
x_AddScorers(CNgAligner & NgAligner,IRegistry * RunRegistry)880 void CNgAlignApp::x_AddScorers(CNgAligner& NgAligner, IRegistry* RunRegistry)
881 {
882 string ScorerNames;
883
884 if(GetArgs()["scorers"].HasValue())
885 ScorerNames = GetArgs()["scorers"].AsString();
886 else
887 ScorerNames = RunRegistry->Get("scorers", "names");
888
889 vector<string> Names;
890 NStr::Split(ScorerNames, " \t;", Names);
891
892 ITERATE(vector<string>, NameIter, Names) {
893
894 if(*NameIter == "blast")
895 NgAligner.AddScorer(
896 new CBlastScorer(CBlastScorer::eSkipUnsupportedAlignments));
897 else if(*NameIter == "pctident")
898 NgAligner.AddScorer(new CPctIdentScorer);
899 else if(*NameIter == "pctcov")
900 NgAligner.AddScorer(new CPctCoverageScorer);
901 else if(*NameIter == "comcomp")
902 NgAligner.AddScorer(new CCommonComponentScorer);
903 else if(*NameIter == "expansion")
904 NgAligner.AddScorer(new CExpansionScorer);
905 else if(*NameIter == "weighted") {
906 double K = 0.04;
907 K = RunRegistry->GetDouble("scorers", "sw_cvg", 0.04);
908 NgAligner.AddScorer(new CWeightedIdentityScorer(K));
909 }
910 else if(*NameIter == "hang")
911 NgAligner.AddScorer(new CHangScorer);
912 else if(*NameIter == "overlap")
913 NgAligner.AddScorer(new COverlapScorer);
914 else if(*NameIter == "clip") {
915 double K = 0.04;
916 K = RunRegistry->GetDouble("scorers", "sw_cvg", 0.04);
917 NgAligner.AddScorer(new CClippedScorer(K));
918 }
919
920 }
921 }
922
923
x_AddFilters(CNgAligner & NgAligner,IRegistry * RunRegistry)924 void CNgAlignApp::x_AddFilters(CNgAligner& NgAligner, IRegistry* RunRegistry)
925 {
926 const CArgs& args = GetArgs();
927
928 int Rank = 0;
929
930 if(args["filters"].HasValue()) {
931 string OrigFilters = args["filters"].AsString();
932 vector<string> SplitFilters;
933 NStr::Split(OrigFilters, ";", SplitFilters);
934 ITERATE(vector<string>, FilterIter, SplitFilters) {
935 const string FilterStr = *FilterIter;
936 try {
937 CRef<CQueryFilter> Filter(new CQueryFilter(Rank, FilterStr));
938 Rank++;
939 NgAligner.AddFilter(Filter);
940 } catch( CException& e) {
941 cerr << "x_AddFilters" << " : " << Rank << " : " << FilterStr << endl;
942 cerr << e.ReportAll() << endl;
943 }
944 }
945 } else {
946 string FilterNames = RunRegistry->Get("filters", "names");
947 vector<string> Names;
948 NStr::Split(FilterNames, " \t", Names);
949 ITERATE(vector<string>, NameIter, Names) {
950 string FilterStr = RunRegistry->Get("filters", *NameIter);
951 try {
952 CRef<CQueryFilter> Filter(new CQueryFilter(Rank, FilterStr));
953 Rank++;
954 NgAligner.AddFilter(Filter);
955 } catch( CException& e) {
956 cerr << "x_AddFilters" << " : " << *NameIter << " : " << FilterStr << endl;
957 cerr << e.ReportAll() << endl;
958 }
959 }
960 }
961 }
962
963
x_AddAligners(CNgAligner & NgAligner,IRegistry * RunRegistry)964 void CNgAlignApp::x_AddAligners(CNgAligner& NgAligner, IRegistry* RunRegistry)
965 {
966 string FilterNames = RunRegistry->Get("aligners", "names");
967 vector<string> Names;
968 NStr::Split(FilterNames, " \t", Names);
969
970 ITERATE(vector<string>, NameIter, Names) {
971
972 string Type = RunRegistry->Get(*NameIter, "type");
973
974 CRef<IAlignmentFactory> Aligner;
975 if(Type == "blast")
976 Aligner = x_CreateBlastAligner(RunRegistry, *NameIter);
977 else if(Type == "remote_blast")
978 Aligner = x_CreateRemoteBlastAligner(RunRegistry, *NameIter);
979 else if(Type == "merge")
980 Aligner = x_CreateMergeAligner(RunRegistry, *NameIter);
981 //else if(Type == "coverage")
982 // Aligner = x_CreateCoverageAligner(RunRegistry, *NameIter);
983 else if(Type == "inversion")
984 Aligner = x_CreateInversionMergeAligner(RunRegistry, *NameIter);
985 else if(Type == "split")
986 Aligner = x_CreateSplitSeqMergeAligner(RunRegistry, *NameIter);
987 else if(Type == "file")
988 Aligner = x_CreateFileLoadAligner(RunRegistry, *NameIter);
989
990 if(!Aligner.IsNull())
991 NgAligner.AddAligner(Aligner);
992 }
993 }
994
995
996 CRef<CBlastAligner>
x_CreateBlastAligner(IRegistry * RunRegistry,const string & Name)997 CNgAlignApp::x_CreateBlastAligner(IRegistry* RunRegistry, const string& Name)
998 {
999 string Params = RunRegistry->Get(Name, "params");
1000 int Threshold = RunRegistry->GetInt(Name, "threshold", 0);
1001 string Filter = RunRegistry->Get(Name, "filter");
1002 bool UseNegatives = RunRegistry->GetBool(Name, "useneg", true);
1003
1004 const CArgs& Args = GetArgs();
1005
1006
1007 if(Args["softfilter"].HasValue())
1008 Filter = Args["softfilter"].AsString();
1009
1010
1011
1012 CRef<CBlastAligner> Blaster;
1013 Blaster.Reset(new CBlastAligner(Params, Threshold));
1014
1015 //if(Filter != -1)
1016 Blaster->SetSoftFiltering(Filter);
1017
1018 Blaster->SetUseNegativeGiList(UseNegatives);
1019
1020 return Blaster;
1021 }
1022
1023
1024 CRef<CRemoteBlastAligner>
x_CreateRemoteBlastAligner(IRegistry * RunRegistry,const string & Name)1025 CNgAlignApp::x_CreateRemoteBlastAligner(IRegistry* RunRegistry, const string& Name)
1026 {
1027 string Params = RunRegistry->Get(Name, "params");
1028 int Threshold = RunRegistry->GetInt(Name, "threshold", 0);
1029 //int Filter = RunRegistry->GetInt(Name, "filter", -1);
1030
1031 //const CArgs& Args = GetArgs();
1032
1033
1034 CRef<CRemoteBlastAligner> Blaster;
1035 Blaster.Reset(new CRemoteBlastAligner(Params, Threshold));
1036
1037 //if(Filter != -1)
1038 // Blaster->SetSoftFiltering(Filter);
1039
1040 return Blaster;
1041 }
1042
1043
1044 CRef<CMergeAligner>
x_CreateMergeAligner(IRegistry * RunRegistry,const string & Name)1045 CNgAlignApp::x_CreateMergeAligner(IRegistry* RunRegistry, const string& Name)
1046 {
1047 int Threshold = RunRegistry->GetInt(Name, "threshold", 0);
1048 double Clip = RunRegistry->GetDouble(Name, "clip", 3.0);
1049 string ModeStr = RunRegistry->Get(Name, "mode");
1050 CMergeAligner::TMode Mode = CMergeAligner::eDefault;
1051 if(ModeStr == "cleanup")
1052 Mode = CMergeAligner::eAlignCleanup;
1053 else if(ModeStr == "tree")
1054 Mode = CMergeAligner::eTreeAlignMerger;
1055
1056 return CRef<CMergeAligner>(new CMergeAligner(Threshold, Mode/*, Clip*/));
1057 }
1058
1059 /*CRef<CCoverageAligner>
1060 CNgAlignApp::x_CreateCoverageAligner(IRegistry* RunRegistry, const string& Name)
1061 {
1062 int Threshold = RunRegistry->GetInt(Name, "threshold", 0);
1063 return CRef<CCoverageAligner>(new CCoverageAligner(Threshold));
1064 }*/
1065
1066
1067 CRef<CInversionMergeAligner>
x_CreateInversionMergeAligner(IRegistry * RunRegistry,const string & Name)1068 CNgAlignApp::x_CreateInversionMergeAligner(IRegistry* RunRegistry, const string& Name)
1069 {
1070 int Threshold = RunRegistry->GetInt(Name, "threshold", 0);
1071 return CRef<CInversionMergeAligner>(new CInversionMergeAligner(Threshold));
1072 }
1073
1074
1075 CRef<CSplitSeqAlignMerger>
x_CreateSplitSeqMergeAligner(IRegistry * RunRegistry,const string & Name)1076 CNgAlignApp::x_CreateSplitSeqMergeAligner(IRegistry* RunRegistry, const string& Name)
1077 {
1078 return CRef<CSplitSeqAlignMerger>(new CSplitSeqAlignMerger(m_Splitter.get()));
1079 }
1080
1081
1082 CRef<CFileLoadAligner>
x_CreateFileLoadAligner(IRegistry * RunRegistry,const string & Name)1083 CNgAlignApp::x_CreateFileLoadAligner(IRegistry* RunRegistry, const string& Name)
1084 {
1085 //int Threshold = RunRegistry->GetInt(Name, "threshold", 0);
1086 string FileNameStr = RunRegistry->Get(Name, "filename");
1087 cerr << "x_CreateFileLoadAligner : " << FileNameStr << endl;
1088 return CRef<CFileLoadAligner>(new CFileLoadAligner(FileNameStr));
1089 }
1090
1091
x_LoadAssembly(CNcbiIstream & In)1092 CRef<CGC_Assembly> CNgAlignApp::x_LoadAssembly(CNcbiIstream& In)
1093 {
1094 CRef<CGC_Assembly> Assembly(new CGC_Assembly);
1095
1096 try {
1097 In >> MSerial_AsnText >> *Assembly;
1098 } catch(...) {
1099 In.clear();
1100 In.seekg(0);
1101 try {
1102 In >> MSerial_AsnBinary >> *Assembly;
1103 } catch(...) {
1104 Assembly.Reset(NULL);
1105 }
1106 }
1107
1108 return Assembly;
1109 }
1110
1111
x_PrintNoHitList(CNcbiOstream & Out,const CSeq_align_set & Alignments)1112 void CNgAlignApp::x_PrintNoHitList(CNcbiOstream& Out, const CSeq_align_set& Alignments)
1113 {
1114 set<CSeq_id_Handle> GivenIds;
1115
1116 ITERATE(list<CRef<CSeq_id> >, IdIter, m_LoadedIds) {
1117 CSeq_id_Handle Handle = CSeq_id_Handle::GetHandle(**IdIter);
1118 GivenIds.insert(Handle);
1119 }
1120
1121 ITERATE(CSeq_align_set::Tdata, AlignIter, Alignments.Get()) {
1122 CSeq_id_Handle AlignIdHandle = CSeq_id_Handle::GetHandle((*AlignIter)->GetSeq_id(0));
1123 set<CSeq_id_Handle>::iterator Found = GivenIds.find(AlignIdHandle);
1124 if(Found != GivenIds.end())
1125 GivenIds.erase(Found);
1126 }
1127
1128 ITERATE(set<CSeq_id_Handle>, IdIter, GivenIds) {
1129 Out << IdIter->AsString() << endl;
1130 }
1131
1132 }
1133
1134
1135 END_NCBI_SCOPE
1136 USING_SCOPE(ncbi);
1137
1138
main(int argc,char ** argv)1139 int main(int argc, char** argv)
1140 {
1141 return CNgAlignApp().AppMain(argc, argv, 0, eDS_Default, 0);
1142 }
1143