1 #include "myutils.h"
2 #include "ultra.h"
3 #include "chime.h"
4 #include "uc.h"
5 #include "dp.h"
6 #include <set>
7 #include <algorithm>
8
9 #define TRACE 0
10
11 extern FILE *g_fUChime;
12
13 void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,
14 vector<unsigned> &Parents);
15
16 void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,
17 const string &PathQA, const string &PathQB, ChimeHit2 &Hit);
18
19 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);
20
GetSmoothedIdVec(const SeqData & QSD,const SeqData & PSD,const string & Path,vector<unsigned> & IdVec,unsigned d)21 static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,
22 vector<unsigned> &IdVec, unsigned d)
23 {
24 IdVec.clear();
25 const unsigned ColCount = SIZE(Path);
26
27 const byte *Q = QSD.Seq;
28 const byte *P = PSD.Seq;
29
30 const unsigned QL = QSD.L;
31 const unsigned PL = PSD.L;
32
33 if (QL <= d)
34 {
35 IdVec.resize(QSD.L, 0);
36 return;
37 }
38
39 unsigned QPos = 0;
40 unsigned PPos = 0;
41
42 vector<bool> SameVec;
43 SameVec.reserve(QL);
44 for (unsigned Col = 0; Col < ColCount; ++Col)
45 {
46 char c = Path[Col];
47
48 bool Same = false;
49 if (c == 'M')
50 {
51 byte q = Q[QPos];
52 byte p = P[PPos];
53 Same = (toupper(q) == toupper(p));
54 }
55
56 if (c == 'M' || c == 'D')
57 {
58 ++QPos;
59 SameVec.push_back(Same);
60 }
61
62 if (c == 'M' || c == 'I')
63 ++PPos;
64 }
65
66 asserta(SIZE(SameVec) == QL);
67
68 unsigned n = 0;
69 for (unsigned QPos = 0; QPos < d; ++QPos)
70 {
71 if (SameVec[QPos])
72 ++n;
73 IdVec.push_back(n);
74 }
75
76 for (unsigned QPos = d; QPos < QL; ++QPos)
77 {
78 if (SameVec[QPos])
79 ++n;
80 IdVec.push_back(n);
81 if (SameVec[QPos-d])
82 --n;
83 }
84 asserta(SIZE(IdVec) == QL);
85
86 #if TRACE
87 {
88 Log("\n");
89 Log("GetSmoothedIdVec\n");
90 unsigned QPos = 0;
91 unsigned PPos = 0;
92 Log("Q P Same Id\n");
93 Log("- - ---- -------\n");
94 for (unsigned Col = 0; Col < ColCount; ++Col)
95 {
96 char c = Path[Col];
97
98 bool Same = false;
99 if (c == 'M')
100 {
101 byte q = Q[QPos];
102 byte p = P[PPos];
103 Same = (toupper(q) == toupper(p));
104 Log("%c %c %4c %7d\n", q, p, tof(Same), IdVec[QPos]);
105 }
106
107 if (c == 'M' || c == 'D')
108 ++QPos;
109 if (c == 'M' || c == 'I')
110 ++PPos;
111 }
112 }
113 #endif
114 }
115
SearchChime(Ultra & U,const SeqData & QSD,float QAb,const AlnParams & AP,const AlnHeuristics & AH,HSPFinder & HF,float MinFractId,ChimeHit2 & Hit)116 bool SearchChime(Ultra &U, const SeqData &QSD, float QAb,
117 const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,
118 float MinFractId, ChimeHit2 &Hit)
119 {
120 Hit.Clear();
121 Hit.QLabel = QSD.Label;
122
123 if (opt_verbose)
124 {
125 Log("\n");
126 Log("SearchChime()\n");
127 Log("Query>%s\n", QSD.Label);
128 }
129
130 vector<unsigned> Parents;
131 GetCandidateParents(U, QSD, QAb, Parents);
132
133 unsigned ParentCount = SIZE(Parents);
134 if (ParentCount <= 1)
135 {
136 if (opt_verbose)
137 Log("%u candidate parents, done.\n", ParentCount);
138 return false;
139 }
140
141 if (opt_fastalign)
142 HF.SetA(QSD);
143 HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);
144
145 unsigned ChunkLength;
146 vector<unsigned> ChunkLos;
147 GetChunkInfo(QSD.L, ChunkLength, ChunkLos);
148 const unsigned ChunkCount = SIZE(ChunkLos);
149
150 vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);
151 vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);
152
153 vector<SeqData> PSDs;
154 vector<string> Paths;
155 double TopPctId = 0.0;
156 unsigned TopParentIndex = UINT_MAX;
157 unsigned QL = QSD.L;
158 vector<unsigned> MaxIdVec(QL, 0);
159 for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)
160 {
161 unsigned ParentSeqIndex = Parents[ParentIndex];
162
163 SeqData PSD;
164 //PSD.Label = U.GetSeedLabel(ParentSeqIndex);
165 //PSD.Seq = U.GetSeedSeq(ParentSeqIndex);
166 //PSD.L = U.GetSeedLength(ParentSeqIndex);
167 //PSD.Index = ParentSeqIndex;
168 U.GetSeqData(ParentSeqIndex, PSD);
169 PSDs.push_back(PSD);
170
171 if (opt_fastalign)
172 HF.SetB(PSD);
173
174 PathData PD;
175
176 float HSPId;
177 bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);
178 if (!Found)
179 {
180 Paths.push_back("");
181 continue;
182 }
183
184 double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);
185 if (opt_selfid && PctId == 100.0)
186 {
187 Paths.push_back("");
188 continue;
189 }
190
191 if (PctId > TopPctId)
192 {
193 TopParentIndex = ParentIndex;
194 TopPctId = PctId;
195 if (TopPctId >= 100.0 - opt_mindiv)
196 {
197 if (opt_verbose)
198 {
199 Log(" %.1f%% >%s\n", TopPctId, PSD.Label);
200 Log(" Top hit exceeds ctl threshold, done.\n");
201 return false;
202 }
203 }
204 }
205
206 string Path = PD.Start;
207 Paths.push_back(Path);
208
209 vector<unsigned> IdVec;
210 GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);
211
212 for (unsigned QPos = 0; QPos < QL; ++QPos)
213 if (IdVec[QPos] > MaxIdVec[QPos])
214 MaxIdVec[QPos] = IdVec[QPos];
215 }
216
217 vector<unsigned> BestParents;
218 for (unsigned k = 0; k < opt_maxp; ++k)
219 {
220 unsigned BestParent = UINT_MAX;
221 unsigned BestCov = 0;
222 for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)
223 {
224 const SeqData &PSD = PSDs[ParentIndex];
225 const string &Path = Paths[ParentIndex];
226 if (Path == "")
227 continue;
228
229 vector<unsigned> IdVec;
230 GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);
231
232 unsigned Cov = 0;
233 for (unsigned QPos = 0; QPos < QL; ++QPos)
234 if (IdVec[QPos] == MaxIdVec[QPos])
235 ++Cov;
236
237 if (Cov > BestCov)
238 {
239 BestParent = ParentIndex;
240 BestCov = Cov;
241 }
242 }
243
244 if (BestParent == UINT_MAX)
245 break;
246
247 BestParents.push_back(BestParent);
248 vector<unsigned> IdVec;
249
250 const SeqData &PSD = PSDs[BestParent];
251 const string &Path = Paths[BestParent];
252 GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);
253 for (unsigned QPos = 0; QPos < QL; ++QPos)
254 if (IdVec[QPos] == MaxIdVec[QPos])
255 MaxIdVec[QPos] = UINT_MAX;
256 }
257
258 unsigned BestParentCount = SIZE(BestParents);
259
260 if (opt_verbose)
261 {
262 Log("%u/%u best parents\n", BestParentCount, ParentCount);
263 for (unsigned k = 0; k < BestParentCount; ++k)
264 {
265 unsigned i = BestParents[k];
266 Log(" %s\n", PSDs[i].Label);
267 }
268 }
269
270 bool Found = false;
271 for (unsigned k1 = 0; k1 < BestParentCount; ++k1)
272 {
273 unsigned i1 = BestParents[k1];
274 asserta(i1 < ParentCount);
275
276 const SeqData &PSD1 = PSDs[i1];
277 const string &Path1 = Paths[i1];
278
279 for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)
280 {
281 unsigned i2 = BestParents[k2];
282 asserta(i2 < ParentCount);
283 asserta(i2 != i1);
284
285 const SeqData &PSD2 = PSDs[i2];
286 const string &Path2 = Paths[i2];
287
288 ChimeHit2 Hit2;
289 AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);
290 Hit2.PctIdQT = TopPctId;
291
292 if (Hit2.Accept())
293 Found = true;
294
295 if (Hit2.Score > Hit.Score)
296 Hit = Hit2;
297
298 if (opt_verbose)
299 Hit2.LogMe();
300 }
301 }
302
303 return Found;
304 }
305