1 #include "myutils.h"
2 #include "chime.h"
3 #include "seqdb.h"
4 #include "dp.h"
5 #include "ultra.h"
6 #include "hspfinder.h"
7 #include <algorithm>
8 #include <set>
9
10 bool SearchChime(Ultra &U, const SeqData &QSD, float QAb,
11 const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,
12 float MinFractId, ChimeHit2 &Hit);
13
14 FILE *g_fUChime;
15 FILE *g_fUChimeAlns;
16 const vector<float> *g_SortVecFloat;
17 bool g_UchimeDeNovo = false;
18
Usage()19 void Usage()
20 {
21 printf("\n");
22 printf("UCHIME %s by Robert C. Edgar\n", MY_VERSION);
23 printf("http://www.drive5.com/uchime\n");
24 printf("\n");
25 printf("This software is donated to the public domain\n");
26 printf("\n");
27
28 printf(
29 #include "help.h"
30 );
31 }
32
SetBLOSUM62()33 void SetBLOSUM62()
34 {
35 Die("SetBLOSUM62 not implemented");
36 }
37
ReadSubstMx(const string &,Mx<float> &)38 void ReadSubstMx(const string &/*FileName*/, Mx<float> &/*Mxf*/)
39 {
40 Die("ReadSubstMx not implemented");
41 }
42
LogAllocs()43 void LogAllocs()
44 {
45 /*empty*/
46 }
47
CmpDescVecFloat(unsigned i,unsigned j)48 static bool CmpDescVecFloat(unsigned i, unsigned j)
49 {
50 return (*g_SortVecFloat)[i] > (*g_SortVecFloat)[j];
51 }
52
Range(vector<unsigned> & v,unsigned N)53 void Range(vector<unsigned> &v, unsigned N)
54 {
55 v.clear();
56 v.reserve(N);
57 for (unsigned i = 0; i < N; ++i)
58 v.push_back(i);
59 }
60
SortDescending(const vector<float> & Values,vector<unsigned> & Order)61 void SortDescending(const vector<float> &Values, vector<unsigned> &Order)
62 {
63 StartTimer(Sort);
64 const unsigned N = SIZE(Values);
65 Range(Order, N);
66 g_SortVecFloat = &Values;
67 sort(Order.begin(), Order.end(), CmpDescVecFloat);
68 EndTimer(Sort);
69 }
70
GetAbFromLabel(const string & Label)71 float GetAbFromLabel(const string &Label)
72 {
73 vector<string> Fields;
74 Split(Label, Fields, '/');
75 const unsigned N = SIZE(Fields);
76 for (unsigned i = 0; i < N; ++i)
77 {
78 const string &Field = Fields[i];
79 if (Field.substr(0, 3) == "ab=")
80 {
81 string a = Field.substr(3, string::npos);
82 return (float) atof(a.c_str());
83 }
84 }
85 if (g_UchimeDeNovo)
86 Die("Missing abundance /ab=xx/ in label >%s", Label.c_str());
87 return 0.0;
88 }
89
main(int argc,char * argv[])90 int main(int argc, char *argv[])
91 {
92
93 MyCmdLine(argc, argv);
94
95 if (argc < 2)
96 {
97 Usage();
98 return 0;
99 }
100
101 if (opt_version)
102 {
103 printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION);
104 return 0;
105 }
106
107 printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION);
108 printf("by Robert C. Edgar\n");
109 printf("http://drive5.com/uchime\n");
110 printf("This code is donated to the public domain.\n");
111 printf("\n");
112 if (!optset_w)
113 opt_w = 8;
114
115 float MinFractId = 0.95f;
116 if (optset_id)
117 MinFractId = (float) opt_id;
118
119 Log("%8.2f minh\n", opt_minh);
120 Log("%8.2f xn\n", opt_xn);
121 Log("%8.2f dn\n", opt_dn);
122 Log("%8.2f xa\n", opt_xa);
123 Log("%8.2f mindiv\n", opt_mindiv);
124 Log("%8u maxp\n", opt_maxp);
125
126 if (opt_input == "" && opt_uchime != "")
127 opt_input = opt_uchime;
128
129 if (opt_input == "")
130 Die("Missing --input");
131
132 g_UchimeDeNovo = (opt_db == "");
133
134 if (opt_uchimeout != "")
135 g_fUChime = CreateStdioFile(opt_uchimeout);
136
137 if (opt_uchimealns != "")
138 g_fUChimeAlns = CreateStdioFile(opt_uchimealns);
139
140 SeqDB Input;
141 SeqDB DB;
142
143 Input.FromFasta(opt_input);
144 if (!Input.IsNucleo())
145 Die("Input contains amino acid sequences");
146
147 const unsigned QuerySeqCount = Input.GetSeqCount();
148 vector<unsigned> Order;
149 for (unsigned i = 0; i < QuerySeqCount; ++i)
150 Order.push_back(i);
151
152 if (g_UchimeDeNovo)
153 {
154 vector<float> Abs;
155 for (unsigned i = 0; i < QuerySeqCount; ++i)
156 {
157 const char *Label = Input.GetLabel(i);
158 float Ab = GetAbFromLabel(Label);
159 Abs.push_back(Ab);
160 }
161 SortDescending(Abs, Order);
162 DB.m_IsNucleoSet = true;
163 DB.m_IsNucleo = true;
164 }
165 else
166 {
167 DB.FromFasta(opt_db);
168 if (!DB.IsNucleo())
169 Die("Database contains amino acid sequences");
170 }
171
172 vector<ChimeHit2> Hits;
173 unsigned HitCount = 0;
174 for (unsigned i = 0; i < QuerySeqCount; ++i)
175 {
176 unsigned QuerySeqIndex = Order[i];
177
178 SeqData QSD;
179 Input.GetSeqData(QuerySeqIndex, QSD);
180
181 float QAb = -1.0;
182 if (g_UchimeDeNovo)
183 QAb = GetAbFromLabel(QSD.Label);
184
185 ChimeHit2 Hit;
186 AlnParams &AP = *(AlnParams *) 0;
187 AlnHeuristics &AH = *(AlnHeuristics *) 0;
188 HSPFinder &HF = *(HSPFinder *) 0;
189 bool Found = SearchChime(DB, QSD, QAb, AP, AH, HF, MinFractId, Hit);
190 if (Found)
191 ++HitCount;
192 else
193 {
194 if (g_UchimeDeNovo)
195 DB.AddSeq(QSD.Label, QSD.Seq, QSD.L);
196 }
197
198 WriteChimeHit(g_fUChime, Hit);
199
200 ProgressStep(i, QuerySeqCount, "%u/%u chimeras found (%.1f%%)", HitCount, i, Pct(HitCount, i+1));
201 }
202
203 Log("\n");
204 Log("%s: %u/%u chimeras found (%.1f%%)\n",
205 opt_input.c_str(), HitCount, QuerySeqCount, Pct(HitCount, QuerySeqCount));
206
207 CloseStdioFile(g_fUChime);
208 CloseStdioFile(g_fUChimeAlns);
209
210 ProgressExit();
211 return 0;
212 }
213