1 /*  # $Id: convert2blastmask.cpp 611888 2020-07-13 11:50:02Z fongah2 $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Ning Ma
27  *
28  */
29 
30 /** @file convert2blastmask.cpp
 * Extracts mask info from a lower-case masked FASTA file and writes it out in
 * ASN.1, XML or interval format.
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <objtools/readers/fasta.hpp>
37 #include <objtools/seqmasks_io/mask_writer_blastdb_maskinfo.hpp>
38 
39 #include "../blast/blast_app_util.hpp"
40 
41 #ifndef SKIP_DOXYGEN_PROCESSING
42 USING_NCBI_SCOPE;
43 USING_SCOPE(objects);
44 USING_SCOPE(blast);
45 #endif /* SKIP_DOXYGEN_PROCESSING */
46 
47 //-----------------------------------------------------------
48 // A hacked fasta file reader that extracts mask info
49 class CMaskFromFasta : public CFastaReader {
50 private:
51     bool m_hasMask;
52     CMaskWriter::TMaskList m_mask;
53     TSeqPos m_from;
54 
55 public:
CMaskFromFasta(CNcbiIstream & input,bool parse_seqids)56     CMaskFromFasta(CNcbiIstream & input, bool parse_seqids)
57         : CFastaReader(input, (parse_seqids ? 0 : CFastaReader::fNoParseID)) {}
58 
HasMask() const59     bool HasMask() const {
60         return m_hasMask;
61     }
62 
GetMask() const63     const CMaskWriter::TMaskList & GetMask() const {
64         return m_mask;
65     }
66 
GetNextSequence()67     bool GetNextSequence() {
68         m_hasMask = false;
69         m_mask.clear();
70         if (AtEOF()) return false;
71         SaveMask();
72         ReadOneSeq();
73         return true;
74     }
75 
76     // hack to deal with interval format
ParseDataLine(const TStr & s,ILineErrorListener * pMessageListener)77     virtual void ParseDataLine(const TStr &s,
78         ILineErrorListener * pMessageListener)
79     {
80         if (s[0] >= '0' && s[0] <= '9' && s.find('-') > 0) {
81             string s1, s2;
82             NStr::SplitInTwo(s,"-",s1,s2);
83             m_hasMask = true;
84             m_mask.push_back(CMaskWriter::TMaskedInterval(
85                                  NStr::StringToUInt(NStr::TruncateSpaces(s1)),
86                                  NStr::StringToUInt(NStr::TruncateSpaces(s2))));
87             // fake a sequence data to make CFastaReader happy
88             CFastaReader::ParseDataLine("A", pMessageListener);
89         } else {
90             CFastaReader::ParseDataLine(s, pMessageListener);
91         }
92     }
93 
94     // hack to deal with fasta format
x_OpenMask(void)95     virtual void x_OpenMask(void) {
96         CFastaReader::x_OpenMask();
97         m_from = GetCurrentPos(ePosWithGapsAndSegs);
98     }
99 
x_CloseMask(void)100     virtual void x_CloseMask(void) {
101         CFastaReader::x_CloseMask();
102         m_hasMask = true;
103         m_mask.push_back(CMaskWriter::TMaskedInterval(m_from,
104                                  GetCurrentPos(ePosWithGapsAndSegs)-1));
105     }
106 };
107 
108 
109 class CConvert2BlastMaskApplication : public CNcbiApplication {
110 public:
CConvert2BlastMaskApplication()111     CConvert2BlastMaskApplication() {
112         CRef<CVersion> version(new CVersion());
113         version->SetVersionInfo(new CBlastVersion());
114         SetFullVersion(version);
115         m_StopWatch.Start();
116         if (m_UsageReport.IsEnabled()) {
117         	m_UsageReport.AddParam(CBlastUsageReport::eVersion, GetVersion().Print());
118         	m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "convert2blastmask");
119         }
120     }
~CConvert2BlastMaskApplication()121     ~CConvert2BlastMaskApplication() {
122     	m_UsageReport.AddParam(CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
123     }
124 
125 private:
126     virtual void Init(void);
127     virtual int  Run(void);
128     virtual void Exit(void);
129 
130     CMaskFromFasta* x_GetReader();
131     CMaskWriterBlastDbMaskInfo* x_GetWriter();
132 
133     void x_AddCmdOptions();
134 
135     /// Contains the description of this application
136     static const char * const USAGE_LINE;
137     CBlastUsageReport m_UsageReport;
138     CStopWatch m_StopWatch;
139 };
140 
141 const char * const CConvert2BlastMaskApplication::USAGE_LINE
142     = "Convert masking information in lower-case masked FASTA input to file formats suitable for makeblastdb";
143 
Init(void)144 void CConvert2BlastMaskApplication::Init(void) {
145     HideStdArgs(fHideLogfile | fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
146 
147     // Create command-line argument descriptions class
148     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
149 
150     // Specify USAGE context
151     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
152                               USAGE_LINE);
153 
154     arg_desc->AddDefaultKey("in", "input_file_name",
155                             "Input file name",
156                             CArgDescriptions::eInputFile, "-");
157 
158     arg_desc->AddDefaultKey("out", "output_file_name",
159                             "Output file name",
160                             CArgDescriptions::eOutputFile, "-");
161 
162     arg_desc->AddDefaultKey("outfmt", "output_format",
163                             "Output file format",
164                             CArgDescriptions::eString, "maskinfo_asn1_text");
165 
166     CArgAllow_Strings* strings_allowed = new CArgAllow_Strings();
167     strings_allowed->Allow("maskinfo_asn1_text");
168     strings_allowed->Allow("maskinfo_asn1_bin");
169     strings_allowed->Allow("maskinfo_xml");
170     strings_allowed->Allow("interval");
171     arg_desc->SetConstraint("outfmt", strings_allowed);
172 
173     arg_desc->AddFlag      ( "parse_seqids",
174                              "Parse Seq-ids in FASTA input", true );
175 
176     arg_desc->AddKey       ("masking_algorithm", "mask_program_name",
177                             "Masking algorithm name (e.g.: dust, seg, "
178                             "windowmasker, repeat). Use 'other' for "
179                             "user-defined type",
180                             CArgDescriptions::eString);
181 
182     arg_desc->AddKey     ("masking_options", "mask_program_options",
183                           "Masking algorithm options to create the masked input"
184                           " (free text to describe/include (command line) "
185                           "options used to create the masking)",
186                           CArgDescriptions::eString);
187 
188     // Setup arg.descriptions for this application
189     SetupArgDescriptions(arg_desc.release());
190 }
191 
192 CMaskFromFasta*
x_GetReader()193 CConvert2BlastMaskApplication::x_GetReader() {
194     const CArgs& args = GetArgs();
195     CNcbiIstream& input = args["in"].AsInputFile();
196     return(new CMaskFromFasta(input, args["parse_seqids"]));
197 }
198 
199 CMaskWriterBlastDbMaskInfo*
x_GetWriter()200 CConvert2BlastMaskApplication::x_GetWriter() {
201     const CArgs& args = GetArgs();
202     const string& format(args["outfmt"].AsString());
203     CNcbiOstream& output = args["out"].AsOutputFile();
204 
205     string algo=args["masking_algorithm"].AsString();
206     NStr::ToLower(algo);
207     EBlast_filter_program prog;
208     if     (algo == "not_set"     ) prog = eBlast_filter_program_not_set;
209     else if(algo == "dust"        ) prog = eBlast_filter_program_dust;
210     else if(algo == "seg"         ) prog = eBlast_filter_program_seg;
211     else if(algo == "windowmasker") prog = eBlast_filter_program_windowmasker;
212     else if(algo == "repeat"      ) prog = eBlast_filter_program_repeat;
213     else                            prog = eBlast_filter_program_other;
214 
215     return(new CMaskWriterBlastDbMaskInfo(output, format, 0,
216                       prog, args["masking_options"].AsString()));
217 }
218 
Run(void)219 int CConvert2BlastMaskApplication::Run(void) {
220     int retval = 0;
221 
222     try {
223         auto_ptr<CMaskFromFasta> reader(x_GetReader());
224         auto_ptr<CMaskWriterBlastDbMaskInfo> writer(x_GetWriter());
225 
226         while (reader->GetNextSequence()) {
227             if(reader->HasMask()) writer->Print(reader->GetBestID(), reader->GetMask());
228         }
229     } catch (const CException& e) {
230         cerr << e.what() << endl;
231         retval = 1;
232     }
233     x_AddCmdOptions();
234     m_UsageReport.AddParam(CBlastUsageReport::eExitStatus, retval);
235     return retval;
236 }
237 
Exit(void)238 void CConvert2BlastMaskApplication::Exit(void)
239 {
240     SetDiagStream(0);
241 }
242 
x_AddCmdOptions()243 void CConvert2BlastMaskApplication::x_AddCmdOptions()
244 {
245 	const CArgs & args = GetArgs();
246     if (args["masking_algorithm"].HasValue()) {
247     	 m_UsageReport.AddParam(CBlastUsageReport::eMaskAlgo, args["masking_algorithm"].AsString());
248     }
249     if (args["outfmt"].HasValue()) {
250     	 m_UsageReport.AddParam(CBlastUsageReport::eOutputFmt, args["outfmt"].AsString());
251     }
252     if (args["parse_seqids"].HasValue()) {
253     	 m_UsageReport.AddParam(CBlastUsageReport::eParseSeqIDs, true);
254     }
255 
256 }
257 
258 #ifndef SKIP_DOXYGEN_PROCESSING
main(int argc,const char * argv[])259 int main(int argc, const char* argv[])
260 {
261     // Execute main application function
262     return CConvert2BlastMaskApplication().AppMain(argc, argv);
263 }
264 #endif /* SKIP_DOXYGEN_PROCESSING */
265 
266