1 /*  $Id: cpgdemo.cpp 103491 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Philip Johnson
27 *
28 * File Description: cpgdemo -- demo program for cpg island finder
29 *
30 * ===========================================================================
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/sequence/cpg.hpp>
35 
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbienv.hpp>
39 
40 #include <objmgr/object_manager.hpp>
41 #include <objtools/data_loaders/genbank/gbloader.hpp>
42 #include <objmgr/scope.hpp>
43 #include <objmgr/seq_vector.hpp>
44 
45 USING_NCBI_SCOPE;
46 USING_SCOPE(objects);
47 
48 class CCpGDemoApp : public CNcbiApplication {
49 public:
CCpGDemoApp(void)50     CCpGDemoApp(void) {DisableArgDescriptions();};
51     virtual void Init(void);
52     virtual int Run(void);
53 };
54 
55 /*---------------------------------------------------------------------------*/
56 
Init(void)57 void CCpGDemoApp::Init(void)
58 {
59     auto_ptr<CArgDescriptions> argDescr(new CArgDescriptions);
60     argDescr->AddDefaultKey("gc", "gcContent", "calibrated organism %GC "
61                             "content (ie. human: 50, rat: 48)",
62                             CArgDescriptions::eInteger, "50");
63     argDescr->AddDefaultKey("cpg", "obsexp",
64                             "observed / expected CpG percentage",
65                             CArgDescriptions::eInteger, "60");
66     argDescr->AddDefaultKey("win", "window_size",
67                             "width of sliding window",
68                             CArgDescriptions::eInteger, "200");
69     argDescr->AddDefaultKey("len", "min_length",
70                             "minimum length of an island",
71                             CArgDescriptions::eInteger, "500");
72     argDescr->AddOptionalKey("m", "merge_isles",
73                              "merge adjacent islands within the specified "
74                              "distance of each other",
75                              CArgDescriptions::eInteger);
76 
77 
78 
79     argDescr->AddOptionalKey("a", "accession",
80                              "Single accession", CArgDescriptions::eString);
81     argDescr->AddOptionalKey("i", "infile",
82                              "List of accessions",
83                              CArgDescriptions::eInputFile);
84     argDescr->AddExtra(0,99, "fasta files", CArgDescriptions::eInputFile);
85 
86 
87     argDescr->SetUsageContext(GetArguments().GetProgramBasename(),
88                               "Scans sequences for CpG islands; uses algorithm based upon Takai & Jones, 2002.  Output sent to stdout.\n", false);
89     SetupArgDescriptions(argDescr.release());
90 }
91 
92 //---------------------------------------------------------------------------
93 // PRE : open output stream, populated CpG island struct
94 // POST: <cpg start> <cpg stop> <%G + C> <obs/exp CpG>
operator <<(CNcbiOstream & o,SCpGIsland i)95 CNcbiOstream& operator<< (CNcbiOstream& o, SCpGIsland i)
96 {
97     unsigned int len = i.m_Stop - i.m_Start + 1;
98     o << i.m_Start << "\t" << i.m_Stop << "\t" <<
99         (double) (i.m_C + i.m_G) / len << "\t" <<
100         (double) i.m_CG * len / (i.m_C * i.m_G);
101     return o;
102 };
103 
104 //---------------------------------------------------------------------------
105 // PRE : accession to scan, scope in which to resolve accession, CArgs
106 // containing cpg island-scanning parameters
107 // POST: 0 if successful, any islands found in the given accession according
108 // to the arguments have been sent to cout
ScanForCpGs(const string & acc,CScope & scope,const CArgs & args)109 int ScanForCpGs(const string& acc, CScope &scope, const CArgs& args)
110 {
111     CRef<CSeq_id> seq_id;
112     try {
113         seq_id.Reset(new CSeq_id(acc));
114     } catch (CSeqIdException& e) {
115         cerr << "Invalid seq-id: '" << acc << "': " << e.what() << endl;
116         return 1;
117     }
118 
119     CBioseq_Handle bioseq_handle = scope.GetBioseqHandle(*seq_id);
120 
121     if (!bioseq_handle) {
122         cerr << "Bioseq load FAILED." << endl;
123         return 2;
124     }
125 
126     CSeqVector sv =
127         bioseq_handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
128     string seqString;
129     seqString.reserve(sv.size());
130     sv.GetSeqData(0, sv.size(), seqString);
131 
132     CCpGIslands cpgIsles(seqString.data(), seqString.length(),
133                          args["win"].AsInteger(), args["len"].AsInteger(),
134                          args["gc"].AsInteger(), args["cpg"].AsInteger());
135     if (args["m"]) {
136         cpgIsles.MergeIslesWithin(args["m"].AsInteger());
137     }
138 
139     ITERATE(CCpGIslands::TIsles, i, cpgIsles.GetIsles()) {
140         cout << acc << "\t" << *i << endl;
141     }
142 
143     return 0;
144 }
145 
146 //---------------------------------------------------------------------------
147 // PRE : fasta file to scan, scope in which to resolve accession, CArgs
148 // containing cpg island-scanning parameters
149 // POST: 0 if successful, any islands found in the given accession according
150 // to the arguments have been sent to cout
ScanForCpGs(istream & infile,const CArgs & args)151 int ScanForCpGs(istream &infile, const CArgs &args)
152 {
153     string localID;
154 
155     infile >> localID;
156     //skip rest of line
157     infile.ignore(numeric_limits<int>::max(), '\n');
158 
159     while (infile) {
160         if (localID[0] != '>') {
161             ERR_POST(Critical << "FASTA file garbled around '" <<localID<<"'");
162             return 1;
163         }
164 
165         //read in nucleotides
166         string seqString, lineBuff;
167         while (!infile.eof() && infile.peek() != '>') {
168             getline(infile, lineBuff);
169             if (seqString.size() + lineBuff.size() > seqString.capacity())
170                 seqString.reserve(seqString.capacity() * 2);
171             seqString += lineBuff;
172         }
173 
174         //scan
175         CCpGIslands cpgIsles(seqString.data(), seqString.length(),
176                              args["win"].AsInteger(), args["len"].AsInteger(),
177                              args["gc"].AsInteger(), args["cpg"].AsInteger());
178         if (args["m"]) {
179             cpgIsles.MergeIslesWithin(args["m"].AsInteger());
180         }
181 
182         //output
183         ITERATE(CCpGIslands::TIsles, i, cpgIsles.GetIsles()) {
184             cout << localID << "\t" << *i << endl;
185         }
186 
187         infile >> localID;
188         infile.ignore(numeric_limits<int>::max(), '\n');
189     }
190 
191     return 0;
192 }
193 
194 
195 /*---------------------------------------------------------------------------*/
196 
Run(void)197 int CCpGDemoApp::Run(void)
198 {
199     const CArgs& args = GetArgs();
200     CRef<CObjectManager> objMgr = CObjectManager::GetInstance();
201     CGBDataLoader::RegisterInObjectManager(*objMgr);
202 
203     CScope scope(*objMgr);
204     scope.AddDefaults();
205     int retCode = 0;
206 
207     cout.precision(2);
208     cout.setf(ios::fixed, ios::floatfield);
209     cout << "# CpG islands.  Win:" << args["win"].AsInteger()
210          << "; Min len:" << args["len"].AsInteger() << "; Min %GC:"
211          <<  args["gc"].AsDouble() << "; Min obs/exp CpG: "
212          << args["cpg"].AsDouble();
213     if (args["m"]) {
214         cout << "; Merge islands within: " << args["m"].AsInteger();
215     }
216     cout << endl;
217     cout << "# label\tisle_start\tisle_stop\t%GC\tobs/exp CpG" << endl;
218 
219     if (args["a"]) {
220         retCode = ScanForCpGs(args["a"].AsString(), scope, args);
221     }
222 
223     if (args["i"]) {
224         istream &infile = args["i"].AsInputFile();
225         string acc;
226 
227         infile >> acc;
228         while (infile.good()) {
229             cerr << "Processing " << acc << endl;
230             if (ScanForCpGs(acc, scope, args) != 0) {
231                 retCode = 3;
232             }
233             infile >> acc;
234         }
235     }
236 
237     for (unsigned int i = 1; i <= args.GetNExtra(); ++i) {
238         if (!ScanForCpGs(args[i].AsInputFile(), args)) {
239             retCode = 3;
240         }
241     }
242 
243     return retCode;
244 }
245 
246 //---------------------------------------------------------------------------
main(int argc,char ** argv)247 int main(int argc, char** argv)
248 {
249     CCpGDemoApp theApp;
250     return theApp.AppMain(argc, argv, NULL, eDS_Default, 0);
251 }
252