1 /*  $Id: winmask_filter.cpp 500404 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  * Initial Version Creation Date:  April 17th, 2008
29  *
30  * File Description:
31  *     Blast wrappers for WindowMasker filtering.
32  *
33  * */
34 
35 /// @file winmask_filter.cpp
36 /// Blast wrappers for WindowMasker filtering.
37 #include <ncbi_pch.hpp>
38 #include "winmask_filter.hpp"
39 #include <sstream>
40 #include <serial/iterator.hpp>
41 #include <objects/seqloc/Seq_loc.hpp>
42 #include <objects/seqalign/Seq_align.hpp>
43 #include <objects/seqalign/Seq_align_set.hpp>
44 #include <objects/seqalign/Dense_seg.hpp>
45 #include <objmgr/util/sequence.hpp>
46 #include <algo/blast/api/blast_types.hpp>
47 
48 #include <algo/blast/api/seqsrc_seqdb.hpp>
49 
50 #include <algo/blast/api/local_blast.hpp>
51 #include <algo/blast/api/objmgr_query_data.hpp>
52 #include <algo/blast/api/blast_nucl_options.hpp>
53 #include <algo/blast/api/windowmask_filter.hpp>
54 #include "blast_setup.hpp"
55 
56 #include <algo/blast/core/blast_seqsrc.h>
57 #include <algo/blast/core/blast_hits.h>
58 #include <algo/blast/core/blast_filter.h>
59 
60 #include <algo/blast/api/blast_aux.hpp>
61 #include <algo/winmask/seq_masker.hpp>
62 #include <corelib/env_reg.hpp>
63 
64 /** @addtogroup AlgoBlast
65  *
66  * @{
67  */
68 
69 BEGIN_NCBI_SCOPE
70 USING_SCOPE(objects);
71 BEGIN_SCOPE(blast)
72 
73 static string s_WINDOW_MASKER_STAT_FILE_NAME("wmasker.obinary");
74 static string s_WINDOW_MASKER_PATH(kEmptyStr);
75 DEFINE_STATIC_MUTEX(InitMutex);
76 
WindowMaskerPathInit(const string & window_masker_path)77 int WindowMaskerPathInit(const string& window_masker_path)
78 {
79     if (CDirEntry(window_masker_path).GetType() != CDirEntry::eDir) {
80         return 1;
81     }
82     {
83         CMutexGuard guard(InitMutex);
84         s_WINDOW_MASKER_PATH = window_masker_path;
85     }
86     return 0;
87 }
WindowMaskerPathReset()88 void WindowMaskerPathReset()
89 {
90     CMutexGuard guard(InitMutex);
91     s_WINDOW_MASKER_PATH.clear();
92 }
WindowMaskerPathGet()93 string WindowMaskerPathGet()
94 {
95     CMutexGuard guard(InitMutex);
96     return s_WINDOW_MASKER_PATH;
97 }
98 
s_BuildSeqMasker(const string & lstat)99 CSeqMasker* s_BuildSeqMasker(const string & lstat)
100 {
101     Uint1 arg_window_size            = 0; // [allow setting of this field?]
102     Uint4 arg_window_step            = 1;
103     Uint1 arg_unit_step              = 1;
104     Uint4 arg_textend                = 0; // [allow setting of this field?]
105     Uint4 arg_cutoff_score           = 0; // [allow setting of this field?]
106     Uint4 arg_max_score              = 0; // [allow setting of this field?]
107     Uint4 arg_min_score              = 0; // [allow setting of this field?]
108     Uint4 arg_set_max_score          = 0; // [allow setting of this field?]
109     Uint4 arg_set_min_score          = 0; // [allow setting of this field?]
110     bool  arg_merge_pass             = false;
111     Uint4 arg_merge_cutoff_score     = 0;
112     Uint4 arg_abs_merge_cutoff_dist  = 0;
113     Uint4 arg_mean_merge_cutoff_dist = 0;
114     Uint1 arg_merge_unit_step        = 0;
115     const string & arg_trigger       = "mean";
116     Uint1 tmin_count                 = 0;
117     bool  arg_discontig              = false;
118     Uint4 arg_pattern                = 0;
119 
120     // enable/disable some kind of optimization
121     bool  arg_use_ba                 = true;
122 
123     // Get a sequence masker.
124 
125     CSeqMasker* masker = NULL;
126 
127     try {
128         masker = new CSeqMasker( lstat,
129                                  arg_window_size,
130                                  arg_window_step,
131                                  arg_unit_step,
132                                  arg_textend,
133                                  arg_cutoff_score,
134                                  arg_max_score,
135                                  arg_min_score,
136                                  arg_set_max_score,
137                                  arg_set_min_score,
138                                  arg_merge_pass,
139                                  arg_merge_cutoff_score,
140                                  arg_abs_merge_cutoff_dist,
141                                  arg_mean_merge_cutoff_dist,
142                                  arg_merge_unit_step,
143                                  arg_trigger,
144                                  tmin_count,
145                                  arg_discontig,
146                                  arg_pattern,
147                                  arg_use_ba );
148     }
149     catch(CException & e) {
150         NCBI_THROW(CBlastException, eSetup, e.what());
151     }
152 
153     return masker;
154 }
155 
s_BuildMaskedRanges(CSeqMasker::TMaskList & masks,const CSeq_loc & seqloc,CSeq_id & query_id,TMaskedQueryRegions * mqr,CRef<CSeq_loc> * psl)156 void s_BuildMaskedRanges(CSeqMasker::TMaskList & masks,
157                          const CSeq_loc        & seqloc,
158                          CSeq_id               & query_id,
159                          TMaskedQueryRegions   * mqr,
160                          CRef<CSeq_loc>        * psl)
161 {
162     TSeqPos query_start = seqloc.GetStart(eExtreme_Positional);
163 
164     // This needs to be examined further for places where a +1, -1,
165     // etc is needed due to biological vs. computer science offset
166     // notations.
167 
168     ITERATE(CSeqMasker::TMaskList, pr, masks) {
169         CRef<CSeq_interval> ival(new CSeq_interval);
170 
171         TSeqPos
172             start  = pr->first,
173             end    = pr->second;
174 
175         ival->SetFrom (query_start + start);
176         ival->SetTo   (query_start + end);
177         ival->SetId   (query_id);
178         ival->SetStrand(eNa_strand_both);
179 
180         if (mqr) {
181             CRef<CSeqLocInfo> info_plus
182                 (new CSeqLocInfo(&* ival, CSeqLocInfo::eFramePlus1));
183             mqr->push_back(info_plus);
184 
185             CRef<CSeqLocInfo> info_minus
186                 (new CSeqLocInfo(&* ival, CSeqLocInfo::eFrameMinus1));
187             mqr->push_back(info_minus);
188         }
189 
190         if (psl) {
191             if (psl->Empty()) {
192                 psl->Reset(new CSeq_loc);
193             }
194             (**psl).SetPacked_int().Set().push_back(ival);
195         }
196     }
197     if (psl && !psl->Empty())
198     {
199         const int kTopFlags = CSeq_loc::fStrand_Ignore|CSeq_loc::fMerge_All|CSeq_loc::fSort;
200         CRef<CSeq_loc> tmp = (*psl)->Merge(kTopFlags, 0);
201         psl->Reset(tmp);
202         (*psl)->ChangeToPackedInt();
203     }
204 
205 }
206 
207 // These templates only exist to reduce code duplication due to the
208 // TSeqLocVector / BlastQueryVector split.  By parameterizing on the
209 // query container type, several functions can call these templates
210 // with different types of queries and options handles, and the
211 // appropriate number of "glue" functions will be generated to call
212 // the actual taxid / filename based implementations.
213 
214 template<class TQueries>
215 void
Blast_FindWindowMaskerLoc_Fwd(TQueries & query,const CBlastOptions * opts)216 Blast_FindWindowMaskerLoc_Fwd(TQueries            & query,
217                               const CBlastOptions * opts)
218 {
219     if (! opts)
220         return;
221 
222     if (opts->GetWindowMaskerDatabase()) {
223         Blast_FindWindowMaskerLoc(query, opts->GetWindowMaskerDatabase());
224     } else if (opts->GetWindowMaskerTaxId()) {
225         Blast_FindWindowMaskerLocTaxId(query, opts->GetWindowMaskerTaxId());
226     }
227 }
228 
229 template<class TQueries>
230 void
Blast_FindWindowMaskerLoc_Fwd(TQueries & query,const CBlastOptionsHandle * opts_handle)231 Blast_FindWindowMaskerLoc_Fwd(TQueries                  & query,
232                               const CBlastOptionsHandle * opts_handle)
233 {
234     if (! opts_handle)
235         return;
236 
237     Blast_FindWindowMaskerLoc_Fwd(query, & opts_handle->GetOptions());
238 }
239 
240 // These four functions exist to provide non-template public
241 // interfaces; the work is done in the two templates above this to
242 // reduce duplication.
243 
244 void
Blast_FindWindowMaskerLoc(CBlastQueryVector & query,const CBlastOptions * opts)245 Blast_FindWindowMaskerLoc(CBlastQueryVector   & query,
246                           const CBlastOptions * opts)
247 {
248     Blast_FindWindowMaskerLoc_Fwd(query, opts);
249 }
250 
251 void
Blast_FindWindowMaskerLoc(TSeqLocVector & query,const CBlastOptions * opts)252 Blast_FindWindowMaskerLoc(TSeqLocVector       & query,
253                           const CBlastOptions * opts)
254 {
255     Blast_FindWindowMaskerLoc_Fwd(query, opts);
256 }
257 
258 void
Blast_FindWindowMaskerLoc(CBlastQueryVector & query,const CBlastOptionsHandle * opts)259 Blast_FindWindowMaskerLoc(CBlastQueryVector         & query,
260                           const CBlastOptionsHandle * opts)
261 {
262     Blast_FindWindowMaskerLoc_Fwd(query, opts);
263 }
264 
265 void
Blast_FindWindowMaskerLoc(TSeqLocVector & query,const CBlastOptionsHandle * opts)266 Blast_FindWindowMaskerLoc(TSeqLocVector             & query,
267                           const CBlastOptionsHandle * opts)
268 {
269     Blast_FindWindowMaskerLoc_Fwd(query, opts);
270 }
271 
272 // These two functions do the actual work.  If either is changed, the
273 // other should be too.  The TSeqLocVector vs. BlastQueryVector
274 // differences could be factored out into a wrapper that isolates the
275 // differences so that the algorithm is not duplicated.  Another
276 // alternative is to (continue to) replace TSeqLocVector with
277 // CBlastQueryVector as was originally planned.
278 
279 void
Blast_FindWindowMaskerLoc(CBlastQueryVector & queries,const string & lstat)280 Blast_FindWindowMaskerLoc(CBlastQueryVector & queries, const string & lstat)
281 {
282     AutoPtr<CSeqMasker> masker(s_BuildSeqMasker(lstat));
283 
284     for(size_t j = 0; j < queries.Size(); j++) {
285         CBlastSearchQuery & query = *queries.GetBlastSearchQuery(j);
286 
287         // Get SeqVector, query Seq-id, and range.
288 
289         CConstRef<CSeq_loc> seqloc = query.GetQuerySeqLoc();
290 
291         CSeqVector psv(*seqloc,
292                        *queries.GetScope(j),
293                        CBioseq_Handle::eCoding_Iupac,
294                        eNa_strand_plus);
295 
296         CRef<CSeq_id> query_seq_id(new CSeq_id);
297         query_seq_id->Assign(*seqloc->GetId());
298 
299         // Mask the query.
300 
301         AutoPtr<CSeqMasker::TMaskList> pos_masks((*masker)(psv));
302 
303         TMaskedQueryRegions mqr;
304 
305         s_BuildMaskedRanges(*pos_masks,
306                             *seqloc,
307                             *query_seq_id,
308                             & mqr,
309                             0);
310 
311         query.SetMaskedRegions(mqr);
312     }
313 }
314 
315 void
Blast_FindWindowMaskerLoc(TSeqLocVector & queries,const string & lstat)316 Blast_FindWindowMaskerLoc(TSeqLocVector & queries, const string & lstat)
317 {
318     AutoPtr<CSeqMasker> masker(s_BuildSeqMasker(lstat));
319 
320     for(size_t j = 0; j < queries.size(); j++) {
321         // Get SeqVector, query Seq-id, and range.
322 
323         CConstRef<CSeq_loc> seqloc = queries[j].seqloc;
324 
325         CSeqVector psv(*seqloc,
326                        *queries[j].scope,
327                        CBioseq_Handle::eCoding_Iupac,
328                        eNa_strand_plus);
329 
330         CRef<CSeq_id> query_seq_id(new CSeq_id);
331         query_seq_id->Assign(*seqloc->GetId());
332 
333         // Mask the query.
334 
335         AutoPtr<CSeqMasker::TMaskList> pos_masks((*masker)(psv));
336 
337         s_BuildMaskedRanges(*pos_masks,
338                             *seqloc,
339                             *query_seq_id,
340                             0,
341                             & queries[j].mask);
342 
343 	if( queries[0].mask ) {
344         CPacked_seqint::Tdata & seqint_list =
345             queries[0].mask->SetPacked_int().Set();
346 
347         NON_CONST_ITERATE(CPacked_seqint::Tdata, itr, seqint_list) {
348             if ((*itr)->CanGetStrand()) {
349                 switch((*itr)->GetStrand()) {
350                 case eNa_strand_unknown:
351                 case eNa_strand_both:
352                 case eNa_strand_plus:
353                     (*itr)->ResetStrand();
354                     break;
355 
356                 default:
357                     break;
358                 }
359             }
360         }
361 	}
362     }
363 }
364 
365 /// Find the path to the window masker files, first checking the (optionally
366 /// set) value passed to the WindowMaskerPathInit function, then the
367 /// environment variable WINDOW_MASKER_PATH, then the section WINDOW_MASKER,
368 /// label WINDOW_MASKER_PATH in the NCBI configuration file. If not found in
369 /// either location, return the current working directory
370 /// @sa s_FindPathToGeneInfoFiles
371 static string
s_FindPathToWM(void)372 s_FindPathToWM(void)
373 {
374     string retval = WindowMaskerPathGet();
375     if ( !retval.empty() ) {
376         return retval;
377     }
378     const string kEnvVar("WINDOW_MASKER_PATH");
379     const string kSection("WINDOW_MASKER");
380     CNcbiIstrstream empty_stream(kEmptyCStr);
381     CRef<CNcbiRegistry> reg(new CNcbiRegistry(empty_stream,
382                                               IRegistry::fWithNcbirc));
383     CRef<CSimpleEnvRegMapper> mapper(new CSimpleEnvRegMapper(kSection,
384                                                              kEmptyStr));
385     CRef<CEnvironmentRegistry> env_reg(new CEnvironmentRegistry);
386     env_reg->AddMapper(*mapper, CEnvironmentRegistry::ePriority_Max);
387     reg->Add(*env_reg, CNcbiRegistry::ePriority_MaxUser);
388     retval = reg->Get(kSection, kEnvVar);
389     if (retval == kEmptyStr) {
390         retval = CDir::GetCwd();
391     }
392 #if defined(NCBI_OS_MSWIN)
393     // We address this here otherwise CDirEntry::IsAbsolutePath() fails
394     if (NStr::StartsWith(retval, "//")) {
395         NStr::ReplaceInPlace(retval, "//", "\\\\");
396     }
397 #endif
398     return retval;
399 }
400 
WindowMaskerTaxidToDb(const string & window_masker_path,int taxid)401 string WindowMaskerTaxidToDb(const string& window_masker_path, int taxid)
402 {
403     string path = window_masker_path;
404     path += CFile::GetPathSeparator() + NStr::IntToString(taxid)
405         + CFile::GetPathSeparator();
406     const string binpath = path + s_WINDOW_MASKER_STAT_FILE_NAME;
407     return (CFile(binpath).Exists() ? binpath : kEmptyStr);
408 }
409 
410 /* Unit test is in bl2seq_unit_test.cpp */
WindowMaskerTaxidToDb(int taxid)411 string WindowMaskerTaxidToDb(int taxid)
412 {
413     string path = s_FindPathToWM();
414     return WindowMaskerTaxidToDb(path, taxid);
415 }
416 
417 void
Blast_FindWindowMaskerLocTaxId(CBlastQueryVector & queries,int taxid)418 Blast_FindWindowMaskerLocTaxId(CBlastQueryVector & queries, int taxid)
419 {
420     string db = WindowMaskerTaxidToDb(taxid);
421     Blast_FindWindowMaskerLoc(queries, db);
422 }
423 
424 void
Blast_FindWindowMaskerLocTaxId(TSeqLocVector & queries,int taxid)425 Blast_FindWindowMaskerLocTaxId(TSeqLocVector & queries, int taxid)
426 {
427     string db = WindowMaskerTaxidToDb(taxid);
428     Blast_FindWindowMaskerLoc(queries, db);
429 }
430 
s_OldGetTaxIdWithWindowMaskerSupport(set<int> & supported_taxids)431 static void s_OldGetTaxIdWithWindowMaskerSupport(set<int>& supported_taxids)
432 {
433     supported_taxids.clear();
434     CNcbiOstrstream oss;
435     const string wmpath = s_FindPathToWM();
436     oss << wmpath << CFile::GetPathSeparator() << "*"
437         << CFile::GetPathSeparator() << "*.*"
438         << CFile::GetPathSeparator() << s_WINDOW_MASKER_STAT_FILE_NAME;
439     const string path = CNcbiOstrstreamToString(oss);
440 
441     list<string> builds;
442     FindFiles(path, builds, fFF_File);
443     NON_CONST_ITERATE(list<string>, path, builds) {
444         // remove the WindowMasker path and path separator
445         path->erase(0, wmpath.size() + 1);
446         // then remove the remaining path
447         const size_t pos = path->find(CFile::GetPathSeparator());
448         path->erase(pos);
449         const int taxid = NStr::StringToInt(*path, NStr::fConvErr_NoThrow);
450         supported_taxids.insert(taxid);
451     }
452 }
453 
GetTaxIdWithWindowMaskerSupport(set<int> & supported_taxids)454 void GetTaxIdWithWindowMaskerSupport(set<int>& supported_taxids)
455 {
456     supported_taxids.clear();
457     CNcbiOstrstream oss;
458     const string wmpath = s_FindPathToWM();
459     oss << wmpath << CFile::GetPathSeparator() << "*"
460         << CFile::GetPathSeparator() << s_WINDOW_MASKER_STAT_FILE_NAME;
461     const string path = CNcbiOstrstreamToString(oss);
462 
463     list<string> builds;
464     FindFiles(path, builds, fFF_File);
465     NON_CONST_ITERATE(list<string>, path, builds) {
466         // remove the WindowMasker path and path separator
467         path->erase(0, wmpath.size() + 1);
468         // then remove the remaining path
469         const size_t pos = path->find(CFile::GetPathSeparator());
470         path->erase(pos);
471         const int taxid = NStr::StringToInt(*path, NStr::fConvErr_NoThrow);
472         supported_taxids.insert(taxid);
473     }
474 
475     if (supported_taxids.empty()) {
476         s_OldGetTaxIdWithWindowMaskerSupport(supported_taxids);
477     }
478 }
479 
480 END_SCOPE(blast)
481 END_NCBI_SCOPE
482 
483 /* @} */
484