1 /* $Id: cpgdemo.cpp 103491 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Philip Johnson
27 *
28 * File Description: cpgdemo -- demo program for cpg island finder
29 *
30 * ===========================================================================
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <algo/sequence/cpg.hpp>
35
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbienv.hpp>
39
40 #include <objmgr/object_manager.hpp>
41 #include <objtools/data_loaders/genbank/gbloader.hpp>
42 #include <objmgr/scope.hpp>
43 #include <objmgr/seq_vector.hpp>
44
45 USING_NCBI_SCOPE;
46 USING_SCOPE(objects);
47
48 class CCpGDemoApp : public CNcbiApplication {
49 public:
CCpGDemoApp(void)50 CCpGDemoApp(void) {DisableArgDescriptions();};
51 virtual void Init(void);
52 virtual int Run(void);
53 };
54
55 /*---------------------------------------------------------------------------*/
56
Init(void)57 void CCpGDemoApp::Init(void)
58 {
59 auto_ptr<CArgDescriptions> argDescr(new CArgDescriptions);
60 argDescr->AddDefaultKey("gc", "gcContent", "calibrated organism %GC "
61 "content (ie. human: 50, rat: 48)",
62 CArgDescriptions::eInteger, "50");
63 argDescr->AddDefaultKey("cpg", "obsexp",
64 "observed / expected CpG percentage",
65 CArgDescriptions::eInteger, "60");
66 argDescr->AddDefaultKey("win", "window_size",
67 "width of sliding window",
68 CArgDescriptions::eInteger, "200");
69 argDescr->AddDefaultKey("len", "min_length",
70 "minimum length of an island",
71 CArgDescriptions::eInteger, "500");
72 argDescr->AddOptionalKey("m", "merge_isles",
73 "merge adjacent islands within the specified "
74 "distance of each other",
75 CArgDescriptions::eInteger);
76
77
78
79 argDescr->AddOptionalKey("a", "accession",
80 "Single accession", CArgDescriptions::eString);
81 argDescr->AddOptionalKey("i", "infile",
82 "List of accessions",
83 CArgDescriptions::eInputFile);
84 argDescr->AddExtra(0,99, "fasta files", CArgDescriptions::eInputFile);
85
86
87 argDescr->SetUsageContext(GetArguments().GetProgramBasename(),
88 "Scans sequences for CpG islands; uses algorithm based upon Takai & Jones, 2002. Output sent to stdout.\n", false);
89 SetupArgDescriptions(argDescr.release());
90 }
91
92 //---------------------------------------------------------------------------
93 // PRE : open output stream, populated CpG island struct
94 // POST: <cpg start> <cpg stop> <%G + C> <obs/exp CpG>
operator <<(CNcbiOstream & o,SCpGIsland i)95 CNcbiOstream& operator<< (CNcbiOstream& o, SCpGIsland i)
96 {
97 unsigned int len = i.m_Stop - i.m_Start + 1;
98 o << i.m_Start << "\t" << i.m_Stop << "\t" <<
99 (double) (i.m_C + i.m_G) / len << "\t" <<
100 (double) i.m_CG * len / (i.m_C * i.m_G);
101 return o;
102 };
103
104 //---------------------------------------------------------------------------
105 // PRE : accession to scan, scope in which to resolve accession, CArgs
106 // containing cpg island-scanning parameters
107 // POST: 0 if successful, any islands found in the given accession according
108 // to the arguments have been sent to cout
ScanForCpGs(const string & acc,CScope & scope,const CArgs & args)109 int ScanForCpGs(const string& acc, CScope &scope, const CArgs& args)
110 {
111 CRef<CSeq_id> seq_id;
112 try {
113 seq_id.Reset(new CSeq_id(acc));
114 } catch (CSeqIdException& e) {
115 cerr << "Invalid seq-id: '" << acc << "': " << e.what() << endl;
116 return 1;
117 }
118
119 CBioseq_Handle bioseq_handle = scope.GetBioseqHandle(*seq_id);
120
121 if (!bioseq_handle) {
122 cerr << "Bioseq load FAILED." << endl;
123 return 2;
124 }
125
126 CSeqVector sv =
127 bioseq_handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
128 string seqString;
129 seqString.reserve(sv.size());
130 sv.GetSeqData(0, sv.size(), seqString);
131
132 CCpGIslands cpgIsles(seqString.data(), seqString.length(),
133 args["win"].AsInteger(), args["len"].AsInteger(),
134 args["gc"].AsInteger(), args["cpg"].AsInteger());
135 if (args["m"]) {
136 cpgIsles.MergeIslesWithin(args["m"].AsInteger());
137 }
138
139 ITERATE(CCpGIslands::TIsles, i, cpgIsles.GetIsles()) {
140 cout << acc << "\t" << *i << endl;
141 }
142
143 return 0;
144 }
145
146 //---------------------------------------------------------------------------
147 // PRE : fasta file to scan, scope in which to resolve accession, CArgs
148 // containing cpg island-scanning parameters
149 // POST: 0 if successful, any islands found in the given accession according
150 // to the arguments have been sent to cout
ScanForCpGs(istream & infile,const CArgs & args)151 int ScanForCpGs(istream &infile, const CArgs &args)
152 {
153 string localID;
154
155 infile >> localID;
156 //skip rest of line
157 infile.ignore(numeric_limits<int>::max(), '\n');
158
159 while (infile) {
160 if (localID[0] != '>') {
161 ERR_POST(Critical << "FASTA file garbled around '" <<localID<<"'");
162 return 1;
163 }
164
165 //read in nucleotides
166 string seqString, lineBuff;
167 while (!infile.eof() && infile.peek() != '>') {
168 getline(infile, lineBuff);
169 if (seqString.size() + lineBuff.size() > seqString.capacity())
170 seqString.reserve(seqString.capacity() * 2);
171 seqString += lineBuff;
172 }
173
174 //scan
175 CCpGIslands cpgIsles(seqString.data(), seqString.length(),
176 args["win"].AsInteger(), args["len"].AsInteger(),
177 args["gc"].AsInteger(), args["cpg"].AsInteger());
178 if (args["m"]) {
179 cpgIsles.MergeIslesWithin(args["m"].AsInteger());
180 }
181
182 //output
183 ITERATE(CCpGIslands::TIsles, i, cpgIsles.GetIsles()) {
184 cout << localID << "\t" << *i << endl;
185 }
186
187 infile >> localID;
188 infile.ignore(numeric_limits<int>::max(), '\n');
189 }
190
191 return 0;
192 }
193
194
195 /*---------------------------------------------------------------------------*/
196
Run(void)197 int CCpGDemoApp::Run(void)
198 {
199 const CArgs& args = GetArgs();
200 CRef<CObjectManager> objMgr = CObjectManager::GetInstance();
201 CGBDataLoader::RegisterInObjectManager(*objMgr);
202
203 CScope scope(*objMgr);
204 scope.AddDefaults();
205 int retCode = 0;
206
207 cout.precision(2);
208 cout.setf(ios::fixed, ios::floatfield);
209 cout << "# CpG islands. Win:" << args["win"].AsInteger()
210 << "; Min len:" << args["len"].AsInteger() << "; Min %GC:"
211 << args["gc"].AsDouble() << "; Min obs/exp CpG: "
212 << args["cpg"].AsDouble();
213 if (args["m"]) {
214 cout << "; Merge islands within: " << args["m"].AsInteger();
215 }
216 cout << endl;
217 cout << "# label\tisle_start\tisle_stop\t%GC\tobs/exp CpG" << endl;
218
219 if (args["a"]) {
220 retCode = ScanForCpGs(args["a"].AsString(), scope, args);
221 }
222
223 if (args["i"]) {
224 istream &infile = args["i"].AsInputFile();
225 string acc;
226
227 infile >> acc;
228 while (infile.good()) {
229 cerr << "Processing " << acc << endl;
230 if (ScanForCpGs(acc, scope, args) != 0) {
231 retCode = 3;
232 }
233 infile >> acc;
234 }
235 }
236
237 for (unsigned int i = 1; i <= args.GetNExtra(); ++i) {
238 if (!ScanForCpGs(args[i].AsInputFile(), args)) {
239 retCode = 3;
240 }
241 }
242
243 return retCode;
244 }
245
246 //---------------------------------------------------------------------------
main(int argc,char ** argv)247 int main(int argc, char** argv)
248 {
249 CCpGDemoApp theApp;
250 return theApp.AppMain(argc, argv, NULL, eDS_Default, 0);
251 }
252