1 /* # $Id: convert2blastmask.cpp 611888 2020-07-13 11:50:02Z fongah2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Ning Ma
27 *
28 */
29
30 /** @file convert2blastmask.cpp
 * Extracts mask info from a lower-case masked FASTA file and writes it out
 * in ASN.1 (text or binary), XML or interval format
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <objtools/readers/fasta.hpp>
37 #include <objtools/seqmasks_io/mask_writer_blastdb_maskinfo.hpp>
38
39 #include "../blast/blast_app_util.hpp"
40
41 #ifndef SKIP_DOXYGEN_PROCESSING
42 USING_NCBI_SCOPE;
43 USING_SCOPE(objects);
44 USING_SCOPE(blast);
45 #endif /* SKIP_DOXYGEN_PROCESSING */
46
47 //-----------------------------------------------------------
48 // A hacked fasta file reader that extracts mask info
49 class CMaskFromFasta : public CFastaReader {
50 private:
51 bool m_hasMask;
52 CMaskWriter::TMaskList m_mask;
53 TSeqPos m_from;
54
55 public:
CMaskFromFasta(CNcbiIstream & input,bool parse_seqids)56 CMaskFromFasta(CNcbiIstream & input, bool parse_seqids)
57 : CFastaReader(input, (parse_seqids ? 0 : CFastaReader::fNoParseID)) {}
58
HasMask() const59 bool HasMask() const {
60 return m_hasMask;
61 }
62
GetMask() const63 const CMaskWriter::TMaskList & GetMask() const {
64 return m_mask;
65 }
66
GetNextSequence()67 bool GetNextSequence() {
68 m_hasMask = false;
69 m_mask.clear();
70 if (AtEOF()) return false;
71 SaveMask();
72 ReadOneSeq();
73 return true;
74 }
75
76 // hack to deal with interval format
ParseDataLine(const TStr & s,ILineErrorListener * pMessageListener)77 virtual void ParseDataLine(const TStr &s,
78 ILineErrorListener * pMessageListener)
79 {
80 if (s[0] >= '0' && s[0] <= '9' && s.find('-') > 0) {
81 string s1, s2;
82 NStr::SplitInTwo(s,"-",s1,s2);
83 m_hasMask = true;
84 m_mask.push_back(CMaskWriter::TMaskedInterval(
85 NStr::StringToUInt(NStr::TruncateSpaces(s1)),
86 NStr::StringToUInt(NStr::TruncateSpaces(s2))));
87 // fake a sequence data to make CFastaReader happy
88 CFastaReader::ParseDataLine("A", pMessageListener);
89 } else {
90 CFastaReader::ParseDataLine(s, pMessageListener);
91 }
92 }
93
94 // hack to deal with fasta format
x_OpenMask(void)95 virtual void x_OpenMask(void) {
96 CFastaReader::x_OpenMask();
97 m_from = GetCurrentPos(ePosWithGapsAndSegs);
98 }
99
x_CloseMask(void)100 virtual void x_CloseMask(void) {
101 CFastaReader::x_CloseMask();
102 m_hasMask = true;
103 m_mask.push_back(CMaskWriter::TMaskedInterval(m_from,
104 GetCurrentPos(ePosWithGapsAndSegs)-1));
105 }
106 };
107
108
109 class CConvert2BlastMaskApplication : public CNcbiApplication {
110 public:
CConvert2BlastMaskApplication()111 CConvert2BlastMaskApplication() {
112 CRef<CVersion> version(new CVersion());
113 version->SetVersionInfo(new CBlastVersion());
114 SetFullVersion(version);
115 m_StopWatch.Start();
116 if (m_UsageReport.IsEnabled()) {
117 m_UsageReport.AddParam(CBlastUsageReport::eVersion, GetVersion().Print());
118 m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "convert2blastmask");
119 }
120 }
~CConvert2BlastMaskApplication()121 ~CConvert2BlastMaskApplication() {
122 m_UsageReport.AddParam(CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
123 }
124
125 private:
126 virtual void Init(void);
127 virtual int Run(void);
128 virtual void Exit(void);
129
130 CMaskFromFasta* x_GetReader();
131 CMaskWriterBlastDbMaskInfo* x_GetWriter();
132
133 void x_AddCmdOptions();
134
135 /// Contains the description of this application
136 static const char * const USAGE_LINE;
137 CBlastUsageReport m_UsageReport;
138 CStopWatch m_StopWatch;
139 };
140
141 const char * const CConvert2BlastMaskApplication::USAGE_LINE
142 = "Convert masking information in lower-case masked FASTA input to file formats suitable for makeblastdb";
143
Init(void)144 void CConvert2BlastMaskApplication::Init(void) {
145 HideStdArgs(fHideLogfile | fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
146
147 // Create command-line argument descriptions class
148 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
149
150 // Specify USAGE context
151 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
152 USAGE_LINE);
153
154 arg_desc->AddDefaultKey("in", "input_file_name",
155 "Input file name",
156 CArgDescriptions::eInputFile, "-");
157
158 arg_desc->AddDefaultKey("out", "output_file_name",
159 "Output file name",
160 CArgDescriptions::eOutputFile, "-");
161
162 arg_desc->AddDefaultKey("outfmt", "output_format",
163 "Output file format",
164 CArgDescriptions::eString, "maskinfo_asn1_text");
165
166 CArgAllow_Strings* strings_allowed = new CArgAllow_Strings();
167 strings_allowed->Allow("maskinfo_asn1_text");
168 strings_allowed->Allow("maskinfo_asn1_bin");
169 strings_allowed->Allow("maskinfo_xml");
170 strings_allowed->Allow("interval");
171 arg_desc->SetConstraint("outfmt", strings_allowed);
172
173 arg_desc->AddFlag ( "parse_seqids",
174 "Parse Seq-ids in FASTA input", true );
175
176 arg_desc->AddKey ("masking_algorithm", "mask_program_name",
177 "Masking algorithm name (e.g.: dust, seg, "
178 "windowmasker, repeat). Use 'other' for "
179 "user-defined type",
180 CArgDescriptions::eString);
181
182 arg_desc->AddKey ("masking_options", "mask_program_options",
183 "Masking algorithm options to create the masked input"
184 " (free text to describe/include (command line) "
185 "options used to create the masking)",
186 CArgDescriptions::eString);
187
188 // Setup arg.descriptions for this application
189 SetupArgDescriptions(arg_desc.release());
190 }
191
192 CMaskFromFasta*
x_GetReader()193 CConvert2BlastMaskApplication::x_GetReader() {
194 const CArgs& args = GetArgs();
195 CNcbiIstream& input = args["in"].AsInputFile();
196 return(new CMaskFromFasta(input, args["parse_seqids"]));
197 }
198
199 CMaskWriterBlastDbMaskInfo*
x_GetWriter()200 CConvert2BlastMaskApplication::x_GetWriter() {
201 const CArgs& args = GetArgs();
202 const string& format(args["outfmt"].AsString());
203 CNcbiOstream& output = args["out"].AsOutputFile();
204
205 string algo=args["masking_algorithm"].AsString();
206 NStr::ToLower(algo);
207 EBlast_filter_program prog;
208 if (algo == "not_set" ) prog = eBlast_filter_program_not_set;
209 else if(algo == "dust" ) prog = eBlast_filter_program_dust;
210 else if(algo == "seg" ) prog = eBlast_filter_program_seg;
211 else if(algo == "windowmasker") prog = eBlast_filter_program_windowmasker;
212 else if(algo == "repeat" ) prog = eBlast_filter_program_repeat;
213 else prog = eBlast_filter_program_other;
214
215 return(new CMaskWriterBlastDbMaskInfo(output, format, 0,
216 prog, args["masking_options"].AsString()));
217 }
218
Run(void)219 int CConvert2BlastMaskApplication::Run(void) {
220 int retval = 0;
221
222 try {
223 auto_ptr<CMaskFromFasta> reader(x_GetReader());
224 auto_ptr<CMaskWriterBlastDbMaskInfo> writer(x_GetWriter());
225
226 while (reader->GetNextSequence()) {
227 if(reader->HasMask()) writer->Print(reader->GetBestID(), reader->GetMask());
228 }
229 } catch (const CException& e) {
230 cerr << e.what() << endl;
231 retval = 1;
232 }
233 x_AddCmdOptions();
234 m_UsageReport.AddParam(CBlastUsageReport::eExitStatus, retval);
235 return retval;
236 }
237
Exit(void)238 void CConvert2BlastMaskApplication::Exit(void)
239 {
240 SetDiagStream(0);
241 }
242
x_AddCmdOptions()243 void CConvert2BlastMaskApplication::x_AddCmdOptions()
244 {
245 const CArgs & args = GetArgs();
246 if (args["masking_algorithm"].HasValue()) {
247 m_UsageReport.AddParam(CBlastUsageReport::eMaskAlgo, args["masking_algorithm"].AsString());
248 }
249 if (args["outfmt"].HasValue()) {
250 m_UsageReport.AddParam(CBlastUsageReport::eOutputFmt, args["outfmt"].AsString());
251 }
252 if (args["parse_seqids"].HasValue()) {
253 m_UsageReport.AddParam(CBlastUsageReport::eParseSeqIDs, true);
254 }
255
256 }
257
258 #ifndef SKIP_DOXYGEN_PROCESSING
main(int argc,const char * argv[])259 int main(int argc, const char* argv[])
260 {
261 // Execute main application function
262 return CConvert2BlastMaskApplication().AppMain(argc, argv);
263 }
264 #endif /* SKIP_DOXYGEN_PROCESSING */
265
266