1 /*  $Id: win_mask_app.cpp 540711 2017-07-10 17:47:24Z mozese2 $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Aleksandr Morgulis
27  *
28  * File Description:
29  *   CWinMaskDemoApplication class member and method definitions.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbidbg.hpp>
35 #include <objtools/readers/fasta.hpp>
36 #include <objects/seqset/Seq_entry.hpp>
37 #include <objects/seq/Bioseq.hpp>
38 #include <objects/seq/Seq_inst.hpp>
39 #include <objects/seq/Seq_data.hpp>
40 #include <objects/seq/seqport_util.hpp>
41 #include <objects/seq/IUPACna.hpp>
42 #include <objects/seqloc/Seq_id.hpp>
43 
44 #include <objmgr/bioseq_ci.hpp>
45 #include <objmgr/object_manager.hpp>
46 #include <objmgr/scope.hpp>
47 #include <objmgr/seq_entry_handle.hpp>
48 
49 #include "win_mask_app.hpp"
50 #include <algo/winmask/win_mask_config.hpp>
51 #include <algo/winmask/win_mask_reader.hpp>
52 #include <algo/winmask/win_mask_fasta_reader.hpp>
53 #include <algo/winmask/win_mask_writer.hpp>
54 #include <algo/winmask/win_mask_gen_counts.hpp>
55 #include <algo/winmask/win_mask_seq_title.hpp>
56 
57 #include <algo/winmask/seq_masker.hpp>
58 #include <algo/winmask/dust_masker.hpp>
59 
60 BEGIN_NCBI_SCOPE
61 USING_SCOPE(objects);
62 
63 //-------------------------------------------------------------------------
64 const char * const
65 CWinMaskDemoApplication::USAGE_LINE = "Window based sequence masker";
66 
67 //-------------------------------------------------------------------------
Init(void)68 void CWinMaskDemoApplication::Init(void)
69 {
70     auto_ptr< CArgDescriptions > arg_desc( new CArgDescriptions );
71 
72     // Set the program description
73     arg_desc->SetUsageContext( GetArguments().GetProgramBasename(),
74                                USAGE_LINE );
75 
76     // Adding command line arguments descriptions
77     arg_desc->AddDefaultKey( "lstat", "length_statistics_file",
78                              "relative unit frequencies "
79                              "(required if -mk_counts is false)",
80                              CArgDescriptions::eString, "" );
81     arg_desc->AddDefaultKey( "input", "input_file_name",
82                              "input file name "
83                              "(not optional if used with -mk_counts option)",
84                              CArgDescriptions::eString, "" );
85     arg_desc->AddDefaultKey( "output", "output_file_name",
86                              "output file name",
87                              CArgDescriptions::eString, "" );
88     arg_desc->AddDefaultKey( "checkdup", "check_duplicates",
89                              "check for duplicate sequences",
90                              CArgDescriptions::eBoolean, "false" );
91     arg_desc->AddDefaultKey( "window", "window_size", "window size",
92                              CArgDescriptions::eInteger, "19" );
93     arg_desc->AddDefaultKey( "wstep", "window_step", "window step",
94                              CArgDescriptions::eInteger, "1" );
95     arg_desc->AddDefaultKey( "ustep", "unit_step", "unit step",
96                              CArgDescriptions::eInteger, "1" );
97     arg_desc->AddDefaultKey( "xdrop", "X_drop",
98                              "value of X-drop parameter",
99                              CArgDescriptions::eInteger, "50" );
100     arg_desc->AddDefaultKey( "score", "score_threshold",
101                              "window score threshold",
102                              CArgDescriptions::eInteger, "100" );
103     arg_desc->AddDefaultKey( "highscore", "max_score",
104                              "maximum useful unit score",
105                              CArgDescriptions::eInteger, "500" );
106     arg_desc->AddOptionalKey( "lowscore", "min_score",
107                               "minimum useful unit score",
108                               CArgDescriptions::eInteger );
109     arg_desc->AddOptionalKey( "sethighscore", "score_value",
110                               "alternative high score for a unit if the"
111                               "original unit score is more than highscore",
112                               CArgDescriptions::eInteger );
113     arg_desc->AddOptionalKey( "setlowscore", "score_value",
114                               "alternative low score for a unit if the"
115                               "original unit score is lower than lowscore",
116                               CArgDescriptions::eInteger );
117     arg_desc->AddDefaultKey( "ambig", "ambiguity_handler",
118                              "the way to handle ambiguity characters",
119                              CArgDescriptions::eString, "break" );
120     arg_desc->AddDefaultKey( "oformat", "output_format",
121                              "controls the format of the masker output",
122                              CArgDescriptions::eString, "interval" );
123     arg_desc->AddDefaultKey( "mpass", "merge_pass_flag",
124                              "true if separate merging pass is needed",
125                              CArgDescriptions::eBoolean, "false" );
126     arg_desc->AddDefaultKey( "discontig", "discontiguous_units",
127                              "true if using discontiguous units",
128                              CArgDescriptions::eBoolean, "false" );
129     arg_desc->AddDefaultKey( "mscore", "merge_cutoff_score",
130                              "minimum average unit score triggering a merge",
131                              CArgDescriptions::eInteger, "50" );
132     arg_desc->AddDefaultKey( "mabs", "distance",
133                              "absolute distance threshold for merging",
134                              CArgDescriptions::eInteger, "8" );
135     arg_desc->AddDefaultKey( "mmean", "distance",
136                              "distance threshold for merging if average unit"
137                              " score is high enough",
138                              CArgDescriptions::eInteger, "50" );
139     arg_desc->AddDefaultKey( "mustep", "merge_unit_step",
140                              "unit step value used for interval merging",
141                              CArgDescriptions::eInteger, "1" );
142     arg_desc->AddDefaultKey( "trigger", "trigger_type",
143                              "type of the event triggering masking",
144                              CArgDescriptions::eString, "mean" );
145     arg_desc->AddDefaultKey( "tmin_count", "unit_count",
146                              "number of units to count with min trigger",
147                              CArgDescriptions::eInteger, "0" );
148     arg_desc->AddDefaultKey( "pattern", "base_mask",
149                              "which bases in a window to use as a discontinuous unit",
150                              CArgDescriptions::eInteger, "0" );
151     arg_desc->AddDefaultKey( "dbg", "debug_output",
152                              "enable debug output",
153                              CArgDescriptions::eBoolean, "false" );
154     arg_desc->AddDefaultKey( "mk_counts", "generate_counts",
155                              "generate frequency counts for a database",
156                              CArgDescriptions::eBoolean, "false" );
157     arg_desc->AddDefaultKey( "fa_list", "input_is_a_list",
158                              "indicates that -input represents a file containing "
159                              "a list of names of fasta files to process, one name "
160                              " per line (can only be used with -mk_counts true)",
161                              CArgDescriptions::eBoolean, "false" );
162     arg_desc->AddDefaultKey( "mem", "available_memory",
163                              "memory available for mk_counts option in megabytes",
164                              CArgDescriptions::eInteger, "1536" );
165     arg_desc->AddDefaultKey( "unit", "unit_length",
166                              "number of bases in a unit",
167                              CArgDescriptions::eInteger, "15" );
168     arg_desc->AddDefaultKey( "th", "thresholds",
169                              "4 percentage values used to determine "
170                              "masking thresholds (4 floating point numbers "
171                              "separated by commas)",
172                              CArgDescriptions::eString, "90,99,99.5,99.8" );
173     arg_desc->AddDefaultKey( "dust", "use_dust",
174                              "combine window masking with dusting",
175                              CArgDescriptions::eBoolean, "F" );
176     arg_desc->AddDefaultKey( "dust_window", "dust_window",
177                              "window size for dusting",
178                              CArgDescriptions::eInteger, "64" );
179     arg_desc->AddDefaultKey( "dust_level", "dust_level",
180                              "dust minimum level",
181                              CArgDescriptions::eInteger, "20" );
182     arg_desc->AddDefaultKey( "dust_linker", "dust_linker",
183                              "link windows by this many basepairs",
184                              CArgDescriptions::eInteger, "1" );
185     arg_desc->AddDefaultKey( "exclude_ids", "exclude_id_list",
186                              "file containing the list of ids to exclude from processing",
187                              CArgDescriptions::eString, "" );
188     arg_desc->AddDefaultKey( "ids", "id_list",
189                              "file containing the list of ids to process",
190                              CArgDescriptions::eString, "" );
191 
192     // Set some constraints on command line parameters
193     arg_desc->SetConstraint( "window",
194                              new CArgAllow_Integers( 1, kMax_Int ) );
195     arg_desc->SetConstraint( "wstep",
196                              new CArgAllow_Integers( 1, kMax_Int ) );
197     arg_desc->SetConstraint( "ustep",
198                              new CArgAllow_Integers( 1, 256 ) );
199     arg_desc->SetConstraint( "xdrop",
200                              new CArgAllow_Integers( 0, kMax_Int ) );
201     arg_desc->SetConstraint( "score",
202                              new CArgAllow_Integers( 1, kMax_Int ) );
203     arg_desc->SetConstraint( "highscore",
204                              new CArgAllow_Integers( 1, kMax_Int ) );
205     arg_desc->SetConstraint( "lowscore",
206                              new CArgAllow_Integers( 1, kMax_Int ) );
207     arg_desc->SetConstraint( "sethighscore",
208                              new CArgAllow_Integers( 1, kMax_Int ) );
209     arg_desc->SetConstraint( "setlowscore",
210                              new CArgAllow_Integers( 1, kMax_Int ) );
211     arg_desc->SetConstraint( "mscore",
212                              new CArgAllow_Integers( 0, kMax_Int ) );
213     arg_desc->SetConstraint( "mabs",
214                              new CArgAllow_Integers( 0, kMax_Int ) );
215     arg_desc->SetConstraint( "mmean",
216                              new CArgAllow_Integers( 0, kMax_Int ) );
217     arg_desc->SetConstraint( "mustep",
218                              new CArgAllow_Integers( 0, 256 ) );
219     arg_desc->SetConstraint( "ambig",
220                              (new CArgAllow_Strings())->Allow( "break" ) );
221     arg_desc->SetConstraint( "oformat",
222                              (new CArgAllow_Strings())->Allow( "interval" )
223                              ->Allow( "fasta" ) );
224     arg_desc->SetConstraint( "trigger",
225                              (new CArgAllow_Strings())->Allow( "mean" )
226                              ->Allow( "min" ) );
227     arg_desc->SetConstraint( "tmin_count",
228                              new CArgAllow_Integers( 0, kMax_Int ) );
229     arg_desc->SetConstraint( "mem", new CArgAllow_Integers( 1, kMax_Int ) );
230     arg_desc->SetConstraint( "unit", new CArgAllow_Integers( 1, 16 ) );
231 
232     // Parse the arguments according to descriptions.
233     SetupArgDescriptions(arg_desc.release());
234 }
235 
236 //-------------------------------------------------------------------------
Run(void)237 int CWinMaskDemoApplication::Run (void)
238 {
239     CRef<CObjectManager> om(CObjectManager::GetInstance());
240 
241     if( GetArgs()["dbg"].AsBoolean() )
242         SetDiagTrace( eDT_Enable );
243 
244     // Read and validate configuration values.
245     CWinMaskConfig aConfig( GetArgs() );
246     aConfig.Validate();
247 
248     if( aConfig.MakeCounts() )
249     {
250         CWinMaskCountsGenerator cg( aConfig.Input(),
251                                     aConfig.Output(),
252                                     aConfig.Th(),
253                                     aConfig.Mem(),
254                                     aConfig.UnitSize(),
255                                     aConfig.MinScore(),
256                                     aConfig.MaxScore(),
257                                     aConfig.HasMinScore(),
258                                     aConfig.CheckDup(),
259                                     aConfig.FaList() );
260         cg();
261         return 0;
262     }
263 
264     CWinMaskReader & theReader = aConfig.Reader();
265     CWinMaskWriter & theWriter = aConfig.Writer();
266     CSeqMasker theMasker( aConfig.LStatName(),
267                           aConfig.WindowSize(),
268                           aConfig.WindowStep(),
269                           aConfig.UnitStep(),
270                           aConfig.XDrop(),
271                           aConfig.CutoffScore(),
272                           aConfig.MaxScore(),
273                           aConfig.MinScore(),
274                           aConfig.SetMaxScore(),
275                           aConfig.SetMinScore(),
276                           aConfig.MergePass(),
277                           aConfig.MergeCutoffScore(),
278                           aConfig.AbsMergeCutoffDist(),
279                           aConfig.MeanMergeCutoffDist(),
280                           aConfig.MergeUnitStep(),
281                           aConfig.Trigger(),
282                           aConfig.TMin_Count(),
283                           aConfig.Discontig(),
284                           aConfig.Pattern() );
285     CRef< CSeq_entry > aSeqEntry( 0 );
286     Uint4 total = 0, total_masked = 0;
287     CDustMasker * duster( 0 );
288     set< string > ids( aConfig.Ids() );
289     set< string > exclude_ids( aConfig.ExcludeIds() );
290 
291     if( aConfig.UseDust() )
292         duster = new CDustMasker( aConfig.DustWindow(),
293                                   aConfig.DustLevel(),
294                                   aConfig.DustLinker() );
295 
296     while( (aSeqEntry = theReader.GetNextSequence()).NotEmpty() )
297     {
298         Uint4 masked = 0;
299         const CBioseq & bioseq = aSeqEntry->GetSeq();
300 
301         if(    bioseq.CanGetInst()
302                && bioseq.GetInst().CanGetLength()
303                && bioseq.GetInst().CanGetSeq_data() )
304         {
305             CRef<CScope> scope(new CScope(*om));
306             CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry( *aSeqEntry );
307             bool process( true );
308             string id( CWinMaskSeqTitle::GetId( seh, bioseq ) );
309 
310             if( !ids.empty() )
311             {
312                 process = false;
313 
314                 if( ids.find( id ) != ids.end() )
315                     process = true;
316             }
317 
318             if( !exclude_ids.empty() )
319                 if( exclude_ids.find( id ) != exclude_ids.end() )
320                     process = false;
321 
322             if( process )
323             {
324                 TSeqPos len = bioseq.GetInst().GetLength();
325                 total += len;
326                 _TRACE( "Sequence length " << len );
327                 const CSeq_data & seqdata = bioseq.GetInst().GetSeq_data();
328                 CRef< CSeq_data > dest( new CSeq_data );
329                 CSeqportUtil::Convert( seqdata, dest, CSeq_data::e_Iupacna,
330                                        0, len );
331                 const string & data = dest->GetIupacna().Get();
332                 auto_ptr< CSeqMasker::TMaskList > mask_info( theMasker( data ) );
333 
334                 if( duster != 0 ) // Dust and merge with mask_info
335                 {
336                     auto_ptr< CSeqMasker::TMaskList > dust_info( (*duster)( data ) );
337                     CSeqMasker::MergeMaskInfo( mask_info.get(), dust_info.get() );
338                 }
339 
340                 theWriter.Print( seh, bioseq, *mask_info );
341 
342                 for( CSeqMasker::TMaskList::const_iterator i = mask_info->begin();
343                      i != mask_info->end(); ++i )
344                     masked += i->second - i->first + 1;
345 
346                 total_masked += masked;
347                 _TRACE( "Number of positions masked: " << masked );
348             }
349         }
350     }
351 
352     _TRACE( "Total number of positions: " << total );
353     _TRACE( "Total number of positions masked: " << total_masked );
354     return 0;
355 }
356 
357 END_NCBI_SCOPE
358