1 /* $Id: win_mask_app.cpp 540711 2017-07-10 17:47:24Z mozese2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksandr Morgulis
27 *
28 * File Description:
29 * CWinMaskDemoApplication class member and method definitions.
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbidbg.hpp>
35 #include <objtools/readers/fasta.hpp>
36 #include <objects/seqset/Seq_entry.hpp>
37 #include <objects/seq/Bioseq.hpp>
38 #include <objects/seq/Seq_inst.hpp>
39 #include <objects/seq/Seq_data.hpp>
40 #include <objects/seq/seqport_util.hpp>
41 #include <objects/seq/IUPACna.hpp>
42 #include <objects/seqloc/Seq_id.hpp>
43
44 #include <objmgr/bioseq_ci.hpp>
45 #include <objmgr/object_manager.hpp>
46 #include <objmgr/scope.hpp>
47 #include <objmgr/seq_entry_handle.hpp>
48
49 #include "win_mask_app.hpp"
50 #include <algo/winmask/win_mask_config.hpp>
51 #include <algo/winmask/win_mask_reader.hpp>
52 #include <algo/winmask/win_mask_fasta_reader.hpp>
53 #include <algo/winmask/win_mask_writer.hpp>
54 #include <algo/winmask/win_mask_gen_counts.hpp>
55 #include <algo/winmask/win_mask_seq_title.hpp>
56
57 #include <algo/winmask/seq_masker.hpp>
58 #include <algo/winmask/dust_masker.hpp>
59
60 BEGIN_NCBI_SCOPE
61 USING_SCOPE(objects);
62
63 //-------------------------------------------------------------------------
64 const char * const
65 CWinMaskDemoApplication::USAGE_LINE = "Window based sequence masker";
66
67 //-------------------------------------------------------------------------
Init(void)68 void CWinMaskDemoApplication::Init(void)
69 {
70 auto_ptr< CArgDescriptions > arg_desc( new CArgDescriptions );
71
72 // Set the program description
73 arg_desc->SetUsageContext( GetArguments().GetProgramBasename(),
74 USAGE_LINE );
75
76 // Adding command line arguments descriptions
77 arg_desc->AddDefaultKey( "lstat", "length_statistics_file",
78 "relative unit frequencies "
79 "(required if -mk_counts is false)",
80 CArgDescriptions::eString, "" );
81 arg_desc->AddDefaultKey( "input", "input_file_name",
82 "input file name "
83 "(not optional if used with -mk_counts option)",
84 CArgDescriptions::eString, "" );
85 arg_desc->AddDefaultKey( "output", "output_file_name",
86 "output file name",
87 CArgDescriptions::eString, "" );
88 arg_desc->AddDefaultKey( "checkdup", "check_duplicates",
89 "check for duplicate sequences",
90 CArgDescriptions::eBoolean, "false" );
91 arg_desc->AddDefaultKey( "window", "window_size", "window size",
92 CArgDescriptions::eInteger, "19" );
93 arg_desc->AddDefaultKey( "wstep", "window_step", "window step",
94 CArgDescriptions::eInteger, "1" );
95 arg_desc->AddDefaultKey( "ustep", "unit_step", "unit step",
96 CArgDescriptions::eInteger, "1" );
97 arg_desc->AddDefaultKey( "xdrop", "X_drop",
98 "value of X-drop parameter",
99 CArgDescriptions::eInteger, "50" );
100 arg_desc->AddDefaultKey( "score", "score_threshold",
101 "window score threshold",
102 CArgDescriptions::eInteger, "100" );
103 arg_desc->AddDefaultKey( "highscore", "max_score",
104 "maximum useful unit score",
105 CArgDescriptions::eInteger, "500" );
106 arg_desc->AddOptionalKey( "lowscore", "min_score",
107 "minimum useful unit score",
108 CArgDescriptions::eInteger );
109 arg_desc->AddOptionalKey( "sethighscore", "score_value",
110 "alternative high score for a unit if the"
111 "original unit score is more than highscore",
112 CArgDescriptions::eInteger );
113 arg_desc->AddOptionalKey( "setlowscore", "score_value",
114 "alternative low score for a unit if the"
115 "original unit score is lower than lowscore",
116 CArgDescriptions::eInteger );
117 arg_desc->AddDefaultKey( "ambig", "ambiguity_handler",
118 "the way to handle ambiguity characters",
119 CArgDescriptions::eString, "break" );
120 arg_desc->AddDefaultKey( "oformat", "output_format",
121 "controls the format of the masker output",
122 CArgDescriptions::eString, "interval" );
123 arg_desc->AddDefaultKey( "mpass", "merge_pass_flag",
124 "true if separate merging pass is needed",
125 CArgDescriptions::eBoolean, "false" );
126 arg_desc->AddDefaultKey( "discontig", "discontiguous_units",
127 "true if using discontiguous units",
128 CArgDescriptions::eBoolean, "false" );
129 arg_desc->AddDefaultKey( "mscore", "merge_cutoff_score",
130 "minimum average unit score triggering a merge",
131 CArgDescriptions::eInteger, "50" );
132 arg_desc->AddDefaultKey( "mabs", "distance",
133 "absolute distance threshold for merging",
134 CArgDescriptions::eInteger, "8" );
135 arg_desc->AddDefaultKey( "mmean", "distance",
136 "distance threshold for merging if average unit"
137 " score is high enough",
138 CArgDescriptions::eInteger, "50" );
139 arg_desc->AddDefaultKey( "mustep", "merge_unit_step",
140 "unit step value used for interval merging",
141 CArgDescriptions::eInteger, "1" );
142 arg_desc->AddDefaultKey( "trigger", "trigger_type",
143 "type of the event triggering masking",
144 CArgDescriptions::eString, "mean" );
145 arg_desc->AddDefaultKey( "tmin_count", "unit_count",
146 "number of units to count with min trigger",
147 CArgDescriptions::eInteger, "0" );
148 arg_desc->AddDefaultKey( "pattern", "base_mask",
149 "which bases in a window to use as a discontinuous unit",
150 CArgDescriptions::eInteger, "0" );
151 arg_desc->AddDefaultKey( "dbg", "debug_output",
152 "enable debug output",
153 CArgDescriptions::eBoolean, "false" );
154 arg_desc->AddDefaultKey( "mk_counts", "generate_counts",
155 "generate frequency counts for a database",
156 CArgDescriptions::eBoolean, "false" );
157 arg_desc->AddDefaultKey( "fa_list", "input_is_a_list",
158 "indicates that -input represents a file containing "
159 "a list of names of fasta files to process, one name "
160 " per line (can only be used with -mk_counts true)",
161 CArgDescriptions::eBoolean, "false" );
162 arg_desc->AddDefaultKey( "mem", "available_memory",
163 "memory available for mk_counts option in megabytes",
164 CArgDescriptions::eInteger, "1536" );
165 arg_desc->AddDefaultKey( "unit", "unit_length",
166 "number of bases in a unit",
167 CArgDescriptions::eInteger, "15" );
168 arg_desc->AddDefaultKey( "th", "thresholds",
169 "4 percentage values used to determine "
170 "masking thresholds (4 floating point numbers "
171 "separated by commas)",
172 CArgDescriptions::eString, "90,99,99.5,99.8" );
173 arg_desc->AddDefaultKey( "dust", "use_dust",
174 "combine window masking with dusting",
175 CArgDescriptions::eBoolean, "F" );
176 arg_desc->AddDefaultKey( "dust_window", "dust_window",
177 "window size for dusting",
178 CArgDescriptions::eInteger, "64" );
179 arg_desc->AddDefaultKey( "dust_level", "dust_level",
180 "dust minimum level",
181 CArgDescriptions::eInteger, "20" );
182 arg_desc->AddDefaultKey( "dust_linker", "dust_linker",
183 "link windows by this many basepairs",
184 CArgDescriptions::eInteger, "1" );
185 arg_desc->AddDefaultKey( "exclude_ids", "exclude_id_list",
186 "file containing the list of ids to exclude from processing",
187 CArgDescriptions::eString, "" );
188 arg_desc->AddDefaultKey( "ids", "id_list",
189 "file containing the list of ids to process",
190 CArgDescriptions::eString, "" );
191
192 // Set some constraints on command line parameters
193 arg_desc->SetConstraint( "window",
194 new CArgAllow_Integers( 1, kMax_Int ) );
195 arg_desc->SetConstraint( "wstep",
196 new CArgAllow_Integers( 1, kMax_Int ) );
197 arg_desc->SetConstraint( "ustep",
198 new CArgAllow_Integers( 1, 256 ) );
199 arg_desc->SetConstraint( "xdrop",
200 new CArgAllow_Integers( 0, kMax_Int ) );
201 arg_desc->SetConstraint( "score",
202 new CArgAllow_Integers( 1, kMax_Int ) );
203 arg_desc->SetConstraint( "highscore",
204 new CArgAllow_Integers( 1, kMax_Int ) );
205 arg_desc->SetConstraint( "lowscore",
206 new CArgAllow_Integers( 1, kMax_Int ) );
207 arg_desc->SetConstraint( "sethighscore",
208 new CArgAllow_Integers( 1, kMax_Int ) );
209 arg_desc->SetConstraint( "setlowscore",
210 new CArgAllow_Integers( 1, kMax_Int ) );
211 arg_desc->SetConstraint( "mscore",
212 new CArgAllow_Integers( 0, kMax_Int ) );
213 arg_desc->SetConstraint( "mabs",
214 new CArgAllow_Integers( 0, kMax_Int ) );
215 arg_desc->SetConstraint( "mmean",
216 new CArgAllow_Integers( 0, kMax_Int ) );
217 arg_desc->SetConstraint( "mustep",
218 new CArgAllow_Integers( 0, 256 ) );
219 arg_desc->SetConstraint( "ambig",
220 (new CArgAllow_Strings())->Allow( "break" ) );
221 arg_desc->SetConstraint( "oformat",
222 (new CArgAllow_Strings())->Allow( "interval" )
223 ->Allow( "fasta" ) );
224 arg_desc->SetConstraint( "trigger",
225 (new CArgAllow_Strings())->Allow( "mean" )
226 ->Allow( "min" ) );
227 arg_desc->SetConstraint( "tmin_count",
228 new CArgAllow_Integers( 0, kMax_Int ) );
229 arg_desc->SetConstraint( "mem", new CArgAllow_Integers( 1, kMax_Int ) );
230 arg_desc->SetConstraint( "unit", new CArgAllow_Integers( 1, 16 ) );
231
232 // Parse the arguments according to descriptions.
233 SetupArgDescriptions(arg_desc.release());
234 }
235
236 //-------------------------------------------------------------------------
Run(void)237 int CWinMaskDemoApplication::Run (void)
238 {
239 CRef<CObjectManager> om(CObjectManager::GetInstance());
240
241 if( GetArgs()["dbg"].AsBoolean() )
242 SetDiagTrace( eDT_Enable );
243
244 // Read and validate configuration values.
245 CWinMaskConfig aConfig( GetArgs() );
246 aConfig.Validate();
247
248 if( aConfig.MakeCounts() )
249 {
250 CWinMaskCountsGenerator cg( aConfig.Input(),
251 aConfig.Output(),
252 aConfig.Th(),
253 aConfig.Mem(),
254 aConfig.UnitSize(),
255 aConfig.MinScore(),
256 aConfig.MaxScore(),
257 aConfig.HasMinScore(),
258 aConfig.CheckDup(),
259 aConfig.FaList() );
260 cg();
261 return 0;
262 }
263
264 CWinMaskReader & theReader = aConfig.Reader();
265 CWinMaskWriter & theWriter = aConfig.Writer();
266 CSeqMasker theMasker( aConfig.LStatName(),
267 aConfig.WindowSize(),
268 aConfig.WindowStep(),
269 aConfig.UnitStep(),
270 aConfig.XDrop(),
271 aConfig.CutoffScore(),
272 aConfig.MaxScore(),
273 aConfig.MinScore(),
274 aConfig.SetMaxScore(),
275 aConfig.SetMinScore(),
276 aConfig.MergePass(),
277 aConfig.MergeCutoffScore(),
278 aConfig.AbsMergeCutoffDist(),
279 aConfig.MeanMergeCutoffDist(),
280 aConfig.MergeUnitStep(),
281 aConfig.Trigger(),
282 aConfig.TMin_Count(),
283 aConfig.Discontig(),
284 aConfig.Pattern() );
285 CRef< CSeq_entry > aSeqEntry( 0 );
286 Uint4 total = 0, total_masked = 0;
287 CDustMasker * duster( 0 );
288 set< string > ids( aConfig.Ids() );
289 set< string > exclude_ids( aConfig.ExcludeIds() );
290
291 if( aConfig.UseDust() )
292 duster = new CDustMasker( aConfig.DustWindow(),
293 aConfig.DustLevel(),
294 aConfig.DustLinker() );
295
296 while( (aSeqEntry = theReader.GetNextSequence()).NotEmpty() )
297 {
298 Uint4 masked = 0;
299 const CBioseq & bioseq = aSeqEntry->GetSeq();
300
301 if( bioseq.CanGetInst()
302 && bioseq.GetInst().CanGetLength()
303 && bioseq.GetInst().CanGetSeq_data() )
304 {
305 CRef<CScope> scope(new CScope(*om));
306 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry( *aSeqEntry );
307 bool process( true );
308 string id( CWinMaskSeqTitle::GetId( seh, bioseq ) );
309
310 if( !ids.empty() )
311 {
312 process = false;
313
314 if( ids.find( id ) != ids.end() )
315 process = true;
316 }
317
318 if( !exclude_ids.empty() )
319 if( exclude_ids.find( id ) != exclude_ids.end() )
320 process = false;
321
322 if( process )
323 {
324 TSeqPos len = bioseq.GetInst().GetLength();
325 total += len;
326 _TRACE( "Sequence length " << len );
327 const CSeq_data & seqdata = bioseq.GetInst().GetSeq_data();
328 CRef< CSeq_data > dest( new CSeq_data );
329 CSeqportUtil::Convert( seqdata, dest, CSeq_data::e_Iupacna,
330 0, len );
331 const string & data = dest->GetIupacna().Get();
332 auto_ptr< CSeqMasker::TMaskList > mask_info( theMasker( data ) );
333
334 if( duster != 0 ) // Dust and merge with mask_info
335 {
336 auto_ptr< CSeqMasker::TMaskList > dust_info( (*duster)( data ) );
337 CSeqMasker::MergeMaskInfo( mask_info.get(), dust_info.get() );
338 }
339
340 theWriter.Print( seh, bioseq, *mask_info );
341
342 for( CSeqMasker::TMaskList::const_iterator i = mask_info->begin();
343 i != mask_info->end(); ++i )
344 masked += i->second - i->first + 1;
345
346 total_masked += masked;
347 _TRACE( "Number of positions masked: " << masked );
348 }
349 }
350 }
351
352 _TRACE( "Total number of positions: " << total );
353 _TRACE( "Total number of positions masked: " << total_masked );
354 return 0;
355 }
356
357 END_NCBI_SCOPE
358