1 /* $Id: winmask_filter.cpp 500404 2016-05-04 14:59:01Z camacho $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 * Initial Version Creation Date: April 17th, 2008
29 *
30 * File Description:
31 * Blast wrappers for WindowMasker filtering.
32 *
33 * */
34
35 /// @file winmask_filter.cpp
36 /// Blast wrappers for WindowMasker filtering.
37 #include <ncbi_pch.hpp>
38 #include "winmask_filter.hpp"
39 #include <sstream>
40 #include <serial/iterator.hpp>
41 #include <objects/seqloc/Seq_loc.hpp>
42 #include <objects/seqalign/Seq_align.hpp>
43 #include <objects/seqalign/Seq_align_set.hpp>
44 #include <objects/seqalign/Dense_seg.hpp>
45 #include <objmgr/util/sequence.hpp>
46 #include <algo/blast/api/blast_types.hpp>
47
48 #include <algo/blast/api/seqsrc_seqdb.hpp>
49
50 #include <algo/blast/api/local_blast.hpp>
51 #include <algo/blast/api/objmgr_query_data.hpp>
52 #include <algo/blast/api/blast_nucl_options.hpp>
53 #include <algo/blast/api/windowmask_filter.hpp>
54 #include "blast_setup.hpp"
55
56 #include <algo/blast/core/blast_seqsrc.h>
57 #include <algo/blast/core/blast_hits.h>
58 #include <algo/blast/core/blast_filter.h>
59
60 #include <algo/blast/api/blast_aux.hpp>
61 #include <algo/winmask/seq_masker.hpp>
62 #include <corelib/env_reg.hpp>
63
64 /** @addtogroup AlgoBlast
65 *
66 * @{
67 */
68
69 BEGIN_NCBI_SCOPE
70 USING_SCOPE(objects);
71 BEGIN_SCOPE(blast)
72
73 static string s_WINDOW_MASKER_STAT_FILE_NAME("wmasker.obinary");
74 static string s_WINDOW_MASKER_PATH(kEmptyStr);
75 DEFINE_STATIC_MUTEX(InitMutex);
76
WindowMaskerPathInit(const string & window_masker_path)77 int WindowMaskerPathInit(const string& window_masker_path)
78 {
79 if (CDirEntry(window_masker_path).GetType() != CDirEntry::eDir) {
80 return 1;
81 }
82 {
83 CMutexGuard guard(InitMutex);
84 s_WINDOW_MASKER_PATH = window_masker_path;
85 }
86 return 0;
87 }
WindowMaskerPathReset()88 void WindowMaskerPathReset()
89 {
90 CMutexGuard guard(InitMutex);
91 s_WINDOW_MASKER_PATH.clear();
92 }
WindowMaskerPathGet()93 string WindowMaskerPathGet()
94 {
95 CMutexGuard guard(InitMutex);
96 return s_WINDOW_MASKER_PATH;
97 }
98
s_BuildSeqMasker(const string & lstat)99 CSeqMasker* s_BuildSeqMasker(const string & lstat)
100 {
101 Uint1 arg_window_size = 0; // [allow setting of this field?]
102 Uint4 arg_window_step = 1;
103 Uint1 arg_unit_step = 1;
104 Uint4 arg_textend = 0; // [allow setting of this field?]
105 Uint4 arg_cutoff_score = 0; // [allow setting of this field?]
106 Uint4 arg_max_score = 0; // [allow setting of this field?]
107 Uint4 arg_min_score = 0; // [allow setting of this field?]
108 Uint4 arg_set_max_score = 0; // [allow setting of this field?]
109 Uint4 arg_set_min_score = 0; // [allow setting of this field?]
110 bool arg_merge_pass = false;
111 Uint4 arg_merge_cutoff_score = 0;
112 Uint4 arg_abs_merge_cutoff_dist = 0;
113 Uint4 arg_mean_merge_cutoff_dist = 0;
114 Uint1 arg_merge_unit_step = 0;
115 const string & arg_trigger = "mean";
116 Uint1 tmin_count = 0;
117 bool arg_discontig = false;
118 Uint4 arg_pattern = 0;
119
120 // enable/disable some kind of optimization
121 bool arg_use_ba = true;
122
123 // Get a sequence masker.
124
125 CSeqMasker* masker = NULL;
126
127 try {
128 masker = new CSeqMasker( lstat,
129 arg_window_size,
130 arg_window_step,
131 arg_unit_step,
132 arg_textend,
133 arg_cutoff_score,
134 arg_max_score,
135 arg_min_score,
136 arg_set_max_score,
137 arg_set_min_score,
138 arg_merge_pass,
139 arg_merge_cutoff_score,
140 arg_abs_merge_cutoff_dist,
141 arg_mean_merge_cutoff_dist,
142 arg_merge_unit_step,
143 arg_trigger,
144 tmin_count,
145 arg_discontig,
146 arg_pattern,
147 arg_use_ba );
148 }
149 catch(CException & e) {
150 NCBI_THROW(CBlastException, eSetup, e.what());
151 }
152
153 return masker;
154 }
155
s_BuildMaskedRanges(CSeqMasker::TMaskList & masks,const CSeq_loc & seqloc,CSeq_id & query_id,TMaskedQueryRegions * mqr,CRef<CSeq_loc> * psl)156 void s_BuildMaskedRanges(CSeqMasker::TMaskList & masks,
157 const CSeq_loc & seqloc,
158 CSeq_id & query_id,
159 TMaskedQueryRegions * mqr,
160 CRef<CSeq_loc> * psl)
161 {
162 TSeqPos query_start = seqloc.GetStart(eExtreme_Positional);
163
164 // This needs to be examined further for places where a +1, -1,
165 // etc is needed due to biological vs. computer science offset
166 // notations.
167
168 ITERATE(CSeqMasker::TMaskList, pr, masks) {
169 CRef<CSeq_interval> ival(new CSeq_interval);
170
171 TSeqPos
172 start = pr->first,
173 end = pr->second;
174
175 ival->SetFrom (query_start + start);
176 ival->SetTo (query_start + end);
177 ival->SetId (query_id);
178 ival->SetStrand(eNa_strand_both);
179
180 if (mqr) {
181 CRef<CSeqLocInfo> info_plus
182 (new CSeqLocInfo(&* ival, CSeqLocInfo::eFramePlus1));
183 mqr->push_back(info_plus);
184
185 CRef<CSeqLocInfo> info_minus
186 (new CSeqLocInfo(&* ival, CSeqLocInfo::eFrameMinus1));
187 mqr->push_back(info_minus);
188 }
189
190 if (psl) {
191 if (psl->Empty()) {
192 psl->Reset(new CSeq_loc);
193 }
194 (**psl).SetPacked_int().Set().push_back(ival);
195 }
196 }
197 if (psl && !psl->Empty())
198 {
199 const int kTopFlags = CSeq_loc::fStrand_Ignore|CSeq_loc::fMerge_All|CSeq_loc::fSort;
200 CRef<CSeq_loc> tmp = (*psl)->Merge(kTopFlags, 0);
201 psl->Reset(tmp);
202 (*psl)->ChangeToPackedInt();
203 }
204
205 }
206
207 // These templates only exist to reduce code duplication due to the
208 // TSeqLocVector / BlastQueryVector split. By parameterizing on the
209 // query container type, several functions can call these templates
210 // with different types of queries and options handles, and the
211 // appropriate number of "glue" functions will be generated to call
212 // the actual taxid / filename based implementations.
213
214 template<class TQueries>
215 void
Blast_FindWindowMaskerLoc_Fwd(TQueries & query,const CBlastOptions * opts)216 Blast_FindWindowMaskerLoc_Fwd(TQueries & query,
217 const CBlastOptions * opts)
218 {
219 if (! opts)
220 return;
221
222 if (opts->GetWindowMaskerDatabase()) {
223 Blast_FindWindowMaskerLoc(query, opts->GetWindowMaskerDatabase());
224 } else if (opts->GetWindowMaskerTaxId()) {
225 Blast_FindWindowMaskerLocTaxId(query, opts->GetWindowMaskerTaxId());
226 }
227 }
228
229 template<class TQueries>
230 void
Blast_FindWindowMaskerLoc_Fwd(TQueries & query,const CBlastOptionsHandle * opts_handle)231 Blast_FindWindowMaskerLoc_Fwd(TQueries & query,
232 const CBlastOptionsHandle * opts_handle)
233 {
234 if (! opts_handle)
235 return;
236
237 Blast_FindWindowMaskerLoc_Fwd(query, & opts_handle->GetOptions());
238 }
239
240 // These four functions exist to provide non-template public
241 // interfaces; the work is done in the two templates above this to
242 // reduce duplication.
243
244 void
Blast_FindWindowMaskerLoc(CBlastQueryVector & query,const CBlastOptions * opts)245 Blast_FindWindowMaskerLoc(CBlastQueryVector & query,
246 const CBlastOptions * opts)
247 {
248 Blast_FindWindowMaskerLoc_Fwd(query, opts);
249 }
250
251 void
Blast_FindWindowMaskerLoc(TSeqLocVector & query,const CBlastOptions * opts)252 Blast_FindWindowMaskerLoc(TSeqLocVector & query,
253 const CBlastOptions * opts)
254 {
255 Blast_FindWindowMaskerLoc_Fwd(query, opts);
256 }
257
258 void
Blast_FindWindowMaskerLoc(CBlastQueryVector & query,const CBlastOptionsHandle * opts)259 Blast_FindWindowMaskerLoc(CBlastQueryVector & query,
260 const CBlastOptionsHandle * opts)
261 {
262 Blast_FindWindowMaskerLoc_Fwd(query, opts);
263 }
264
265 void
Blast_FindWindowMaskerLoc(TSeqLocVector & query,const CBlastOptionsHandle * opts)266 Blast_FindWindowMaskerLoc(TSeqLocVector & query,
267 const CBlastOptionsHandle * opts)
268 {
269 Blast_FindWindowMaskerLoc_Fwd(query, opts);
270 }
271
272 // These two functions do the actual work. If either is changed, the
273 // other should be too. The TSeqLocVector vs. BlastQueryVector
274 // differences could be factored out into a wrapper that isolates the
275 // differences so that the algorithm is not duplicated. Another
276 // alternative is to (continue to) replace TSeqLocVector with
277 // CBlastQueryVector as was originally planned.
278
279 void
Blast_FindWindowMaskerLoc(CBlastQueryVector & queries,const string & lstat)280 Blast_FindWindowMaskerLoc(CBlastQueryVector & queries, const string & lstat)
281 {
282 AutoPtr<CSeqMasker> masker(s_BuildSeqMasker(lstat));
283
284 for(size_t j = 0; j < queries.Size(); j++) {
285 CBlastSearchQuery & query = *queries.GetBlastSearchQuery(j);
286
287 // Get SeqVector, query Seq-id, and range.
288
289 CConstRef<CSeq_loc> seqloc = query.GetQuerySeqLoc();
290
291 CSeqVector psv(*seqloc,
292 *queries.GetScope(j),
293 CBioseq_Handle::eCoding_Iupac,
294 eNa_strand_plus);
295
296 CRef<CSeq_id> query_seq_id(new CSeq_id);
297 query_seq_id->Assign(*seqloc->GetId());
298
299 // Mask the query.
300
301 AutoPtr<CSeqMasker::TMaskList> pos_masks((*masker)(psv));
302
303 TMaskedQueryRegions mqr;
304
305 s_BuildMaskedRanges(*pos_masks,
306 *seqloc,
307 *query_seq_id,
308 & mqr,
309 0);
310
311 query.SetMaskedRegions(mqr);
312 }
313 }
314
315 void
Blast_FindWindowMaskerLoc(TSeqLocVector & queries,const string & lstat)316 Blast_FindWindowMaskerLoc(TSeqLocVector & queries, const string & lstat)
317 {
318 AutoPtr<CSeqMasker> masker(s_BuildSeqMasker(lstat));
319
320 for(size_t j = 0; j < queries.size(); j++) {
321 // Get SeqVector, query Seq-id, and range.
322
323 CConstRef<CSeq_loc> seqloc = queries[j].seqloc;
324
325 CSeqVector psv(*seqloc,
326 *queries[j].scope,
327 CBioseq_Handle::eCoding_Iupac,
328 eNa_strand_plus);
329
330 CRef<CSeq_id> query_seq_id(new CSeq_id);
331 query_seq_id->Assign(*seqloc->GetId());
332
333 // Mask the query.
334
335 AutoPtr<CSeqMasker::TMaskList> pos_masks((*masker)(psv));
336
337 s_BuildMaskedRanges(*pos_masks,
338 *seqloc,
339 *query_seq_id,
340 0,
341 & queries[j].mask);
342
343 if( queries[0].mask ) {
344 CPacked_seqint::Tdata & seqint_list =
345 queries[0].mask->SetPacked_int().Set();
346
347 NON_CONST_ITERATE(CPacked_seqint::Tdata, itr, seqint_list) {
348 if ((*itr)->CanGetStrand()) {
349 switch((*itr)->GetStrand()) {
350 case eNa_strand_unknown:
351 case eNa_strand_both:
352 case eNa_strand_plus:
353 (*itr)->ResetStrand();
354 break;
355
356 default:
357 break;
358 }
359 }
360 }
361 }
362 }
363 }
364
365 /// Find the path to the window masker files, first checking the (optionally
366 /// set) value passed to the WindowMaskerPathInit function, then the
367 /// environment variable WINDOW_MASKER_PATH, then the section WINDOW_MASKER,
368 /// label WINDOW_MASKER_PATH in the NCBI configuration file. If not found in
369 /// either location, return the current working directory
370 /// @sa s_FindPathToGeneInfoFiles
371 static string
s_FindPathToWM(void)372 s_FindPathToWM(void)
373 {
374 string retval = WindowMaskerPathGet();
375 if ( !retval.empty() ) {
376 return retval;
377 }
378 const string kEnvVar("WINDOW_MASKER_PATH");
379 const string kSection("WINDOW_MASKER");
380 CNcbiIstrstream empty_stream(kEmptyCStr);
381 CRef<CNcbiRegistry> reg(new CNcbiRegistry(empty_stream,
382 IRegistry::fWithNcbirc));
383 CRef<CSimpleEnvRegMapper> mapper(new CSimpleEnvRegMapper(kSection,
384 kEmptyStr));
385 CRef<CEnvironmentRegistry> env_reg(new CEnvironmentRegistry);
386 env_reg->AddMapper(*mapper, CEnvironmentRegistry::ePriority_Max);
387 reg->Add(*env_reg, CNcbiRegistry::ePriority_MaxUser);
388 retval = reg->Get(kSection, kEnvVar);
389 if (retval == kEmptyStr) {
390 retval = CDir::GetCwd();
391 }
392 #if defined(NCBI_OS_MSWIN)
393 // We address this here otherwise CDirEntry::IsAbsolutePath() fails
394 if (NStr::StartsWith(retval, "//")) {
395 NStr::ReplaceInPlace(retval, "//", "\\\\");
396 }
397 #endif
398 return retval;
399 }
400
WindowMaskerTaxidToDb(const string & window_masker_path,int taxid)401 string WindowMaskerTaxidToDb(const string& window_masker_path, int taxid)
402 {
403 string path = window_masker_path;
404 path += CFile::GetPathSeparator() + NStr::IntToString(taxid)
405 + CFile::GetPathSeparator();
406 const string binpath = path + s_WINDOW_MASKER_STAT_FILE_NAME;
407 return (CFile(binpath).Exists() ? binpath : kEmptyStr);
408 }
409
410 /* Unit test is in bl2seq_unit_test.cpp */
WindowMaskerTaxidToDb(int taxid)411 string WindowMaskerTaxidToDb(int taxid)
412 {
413 string path = s_FindPathToWM();
414 return WindowMaskerTaxidToDb(path, taxid);
415 }
416
417 void
Blast_FindWindowMaskerLocTaxId(CBlastQueryVector & queries,int taxid)418 Blast_FindWindowMaskerLocTaxId(CBlastQueryVector & queries, int taxid)
419 {
420 string db = WindowMaskerTaxidToDb(taxid);
421 Blast_FindWindowMaskerLoc(queries, db);
422 }
423
424 void
Blast_FindWindowMaskerLocTaxId(TSeqLocVector & queries,int taxid)425 Blast_FindWindowMaskerLocTaxId(TSeqLocVector & queries, int taxid)
426 {
427 string db = WindowMaskerTaxidToDb(taxid);
428 Blast_FindWindowMaskerLoc(queries, db);
429 }
430
s_OldGetTaxIdWithWindowMaskerSupport(set<int> & supported_taxids)431 static void s_OldGetTaxIdWithWindowMaskerSupport(set<int>& supported_taxids)
432 {
433 supported_taxids.clear();
434 CNcbiOstrstream oss;
435 const string wmpath = s_FindPathToWM();
436 oss << wmpath << CFile::GetPathSeparator() << "*"
437 << CFile::GetPathSeparator() << "*.*"
438 << CFile::GetPathSeparator() << s_WINDOW_MASKER_STAT_FILE_NAME;
439 const string path = CNcbiOstrstreamToString(oss);
440
441 list<string> builds;
442 FindFiles(path, builds, fFF_File);
443 NON_CONST_ITERATE(list<string>, path, builds) {
444 // remove the WindowMasker path and path separator
445 path->erase(0, wmpath.size() + 1);
446 // then remove the remaining path
447 const size_t pos = path->find(CFile::GetPathSeparator());
448 path->erase(pos);
449 const int taxid = NStr::StringToInt(*path, NStr::fConvErr_NoThrow);
450 supported_taxids.insert(taxid);
451 }
452 }
453
GetTaxIdWithWindowMaskerSupport(set<int> & supported_taxids)454 void GetTaxIdWithWindowMaskerSupport(set<int>& supported_taxids)
455 {
456 supported_taxids.clear();
457 CNcbiOstrstream oss;
458 const string wmpath = s_FindPathToWM();
459 oss << wmpath << CFile::GetPathSeparator() << "*"
460 << CFile::GetPathSeparator() << s_WINDOW_MASKER_STAT_FILE_NAME;
461 const string path = CNcbiOstrstreamToString(oss);
462
463 list<string> builds;
464 FindFiles(path, builds, fFF_File);
465 NON_CONST_ITERATE(list<string>, path, builds) {
466 // remove the WindowMasker path and path separator
467 path->erase(0, wmpath.size() + 1);
468 // then remove the remaining path
469 const size_t pos = path->find(CFile::GetPathSeparator());
470 path->erase(pos);
471 const int taxid = NStr::StringToInt(*path, NStr::fConvErr_NoThrow);
472 supported_taxids.insert(taxid);
473 }
474
475 if (supported_taxids.empty()) {
476 s_OldGetTaxIdWithWindowMaskerSupport(supported_taxids);
477 }
478 }
479
480 END_SCOPE(blast)
481 END_NCBI_SCOPE
482
483 /* @} */
484