1 /* $Id: local_finder.cpp 467698 2015-05-15 15:51:24Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alexandre Souvorov
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbiargs.hpp>
36
37 #include <algo/gnomon/gnomon.hpp>
38 #include <serial/serial.hpp>
39 #include <serial/objostr.hpp>
40 #include <objtools/readers/fasta.hpp>
41 #include <objects/seqfeat/Imp_feat.hpp>
42 #include <objmgr/object_manager.hpp>
43
44 USING_SCOPE(ncbi);
45 USING_SCOPE(ncbi::objects);
46 USING_SCOPE(ncbi::gnomon);
47
48 class CLocalFinderApp : public CNcbiApplication
49 {
50 public:
51 void Init(void);
52 int Run(void);
53 };
54
55
56
Init(void)57 void CLocalFinderApp::Init(void)
58 {
59 // Prepare command line descriptions
60 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
61
62 arg_desc->AddKey("input", "FastaFile",
63 "File containing FASTA-format sequence",
64 CArgDescriptions::eInputFile);
65
66 arg_desc->AddDefaultKey("from", "From",
67 "From",
68 CArgDescriptions::eInteger,
69 "0");
70
71 arg_desc->AddDefaultKey("to", "To",
72 "To",
73 CArgDescriptions::eInteger,
74 "1000000000");
75
76 arg_desc->AddKey("model", "ModelData",
77 "Model Data",
78 CArgDescriptions::eInputFile);
79
80 arg_desc->AddOptionalKey("align", "Alignments",
81 "Alignments",
82 CArgDescriptions::eInputFile);
83
84 arg_desc->AddFlag("rep", "Repeats");
85
86
87 // Pass argument descriptions to the application
88 //
89
90 SetupArgDescriptions(arg_desc.release());
91 }
92
93
Run(void)94 int CLocalFinderApp::Run(void)
95 {
96 const CArgs& myargs = GetArgs();
97
98 int left = myargs["from"].AsInteger();
99 int right = myargs["to"].AsInteger();
100 bool repeats = myargs["rep"];
101
102
103 //
104 // read our sequence data
105 //
106 CFastaReader fastareader(myargs["input"].AsString());
107 CRef<CSeq_loc> masked_regions;
108 masked_regions = fastareader.SaveMask();
109 CRef<CSeq_entry> se = fastareader.ReadOneSeq();
110
111 if(masked_regions) {
112 CBioseq& bioseq = se->SetSeq(); // assumes that reader gets only one sequence per fasta id (no [] in file)
113 CRef<CSeq_annot> seq_annot(new CSeq_annot);
114 seq_annot->SetNameDesc("NCBI-FASTA-Lowercase");
115 bioseq.SetAnnot().push_back(seq_annot);
116 CSeq_annot::C_Data::TFtable* feature_table = &seq_annot->SetData().SetFtable();
117 for(CSeq_loc_CI i(*masked_regions); i; ++i) {
118 CRef<CSeq_feat> repeat(new CSeq_feat);
119 CRef<CSeq_id> id(new CSeq_id);
120 id->Assign(i.GetSeq_id());
121 CRef<CSeq_loc> loc(new CSeq_loc(*id, i.GetRange().GetFrom(), i.GetRange().GetTo()));
122 repeat->SetLocation(*loc);
123 repeat->SetData().SetImp().SetKey("repeat_region");
124 feature_table->push_back(repeat);
125 }
126 }
127
128 CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
129 CScope scope(*objmgr);
130 scope.AddTopLevelSeqEntry(*se);
131
132 CRef<CSeq_id> cntg(new CSeq_id);
133 cntg->Assign(*se->GetSeq().GetFirstId());
134 CSeq_loc loc;
135 loc.SetWhole(*cntg);
136 CSeqVector vec(loc, scope);
137 vec.SetIupacCoding();
138
139 CResidueVec seq;
140 ITERATE(CSeqVector,i,vec)
141 seq.push_back(*i);
142
143 // read the alignment information
144 TGeneModelList alignments;
145 if(myargs["align"]) {
146 CNcbiIstream& alignmentfile = myargs["align"].AsInputFile();
147 string our_contig = cntg->GetSeqIdString(true);
148 string cur_contig;
149 CAlignModel algn;
150
151 while(alignmentfile >> algn >> getcontig(cur_contig)) {
152 if (cur_contig==our_contig)
153 alignments.push_back(algn);
154 }
155 }
156
157 // create engine
158 CRef<CHMMParameters> hmm_params(new CHMMParameters(myargs["model"].AsInputFile()));
159 CGnomonEngine gnomon(hmm_params, seq, TSignedSeqRange(left, right));
160
161 // run!
162 gnomon.Run(alignments, repeats, true, true, false, false, 10.0);
163
164 // dump the annotation
165 CRef<CSeq_annot> annot = gnomon.GetAnnot(*cntg);
166 auto_ptr<CObjectOStream> os(CObjectOStream::Open(eSerial_AsnText, cout));
167 *os << *annot;
168
169 return 0;
170
171 }
172
173
174 /////////////////////////////////////////////////////////////////////////////
175 // MAIN
176
177
main(int argc,const char * argv[])178 int main(int argc, const char* argv[])
179 {
180 return CLocalFinderApp().AppMain(argc, argv);
181 }
182
183