1 #include "myutils.h"
2 #include "ultra.h"
3 #include "chime.h"
4 #include "uc.h"
5 #include "dp.h"
6 #include <set>
7 #include <algorithm>
8 
9 #define TRACE	0
10 
11 extern FILE *g_fUChime;
12 
13 void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ,
14   vector<unsigned> &Parents);
15 
16 void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,
17   const string &PathQA, const string &PathQB, ChimeHit2 &Hit);
18 
19 double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo);
20 
GetSmoothedIdVec(const SeqData & QSD,const SeqData & PSD,const string & Path,vector<unsigned> & IdVec,unsigned d)21 static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path,
22   vector<unsigned> &IdVec, unsigned d)
23 	{
24 	IdVec.clear();
25 	const unsigned ColCount = SIZE(Path);
26 
27 	const byte *Q = QSD.Seq;
28 	const byte *P = PSD.Seq;
29 
30 	const unsigned QL = QSD.L;
31 	const unsigned PL = PSD.L;
32 
33 	if (QL <= d)
34 		{
35 		IdVec.resize(QSD.L, 0);
36 		return;
37 		}
38 
39 	unsigned QPos = 0;
40 	unsigned PPos = 0;
41 
42 	vector<bool> SameVec;
43 	SameVec.reserve(QL);
44 	for (unsigned Col = 0; Col < ColCount; ++Col)
45 		{
46 		char c = Path[Col];
47 
48 		bool Same = false;
49 		if (c == 'M')
50 			{
51 			byte q = Q[QPos];
52 			byte p = P[PPos];
53 			Same = (toupper(q) == toupper(p));
54 			}
55 
56 		if (c == 'M' || c == 'D')
57 			{
58 			++QPos;
59 			SameVec.push_back(Same);
60 			}
61 
62 		if (c == 'M' || c == 'I')
63 			++PPos;
64 		}
65 
66 	asserta(SIZE(SameVec) == QL);
67 
68 	unsigned n = 0;
69 	for (unsigned QPos = 0; QPos < d; ++QPos)
70 		{
71 		if (SameVec[QPos])
72 			++n;
73 		IdVec.push_back(n);
74 		}
75 
76 	for (unsigned QPos = d; QPos < QL; ++QPos)
77 		{
78 		if (SameVec[QPos])
79 			++n;
80 		IdVec.push_back(n);
81 		if (SameVec[QPos-d])
82 			--n;
83 		}
84 	asserta(SIZE(IdVec) == QL);
85 
86 #if	TRACE
87 	{
88 	Log("\n");
89 	Log("GetSmoothedIdVec\n");
90 	unsigned QPos = 0;
91 	unsigned PPos = 0;
92 	Log("Q P  Same       Id\n");
93 	Log("- -  ----  -------\n");
94 	for (unsigned Col = 0; Col < ColCount; ++Col)
95 		{
96 		char c = Path[Col];
97 
98 		bool Same = false;
99 		if (c == 'M')
100 			{
101 			byte q = Q[QPos];
102 			byte p = P[PPos];
103 			Same = (toupper(q) == toupper(p));
104 			Log("%c %c  %4c  %7d\n", q, p, tof(Same), IdVec[QPos]);
105 			}
106 
107 		if (c == 'M' || c == 'D')
108 			++QPos;
109 		if (c == 'M' || c == 'I')
110 			++PPos;
111 		}
112 	}
113 #endif
114 	}
115 
SearchChime(Ultra & U,const SeqData & QSD,float QAb,const AlnParams & AP,const AlnHeuristics & AH,HSPFinder & HF,float MinFractId,ChimeHit2 & Hit)116 bool SearchChime(Ultra &U, const SeqData &QSD, float QAb,
117   const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF,
118   float MinFractId, ChimeHit2 &Hit)
119 	{
120 	Hit.Clear();
121 	Hit.QLabel = QSD.Label;
122 
123 	if (opt_verbose)
124 		{
125 		Log("\n");
126 		Log("SearchChime()\n");
127 		Log("Query>%s\n", QSD.Label);
128 		}
129 
130 	vector<unsigned> Parents;
131 	GetCandidateParents(U, QSD, QAb, Parents);
132 
133 	unsigned ParentCount = SIZE(Parents);
134 	if (ParentCount <= 1)
135 		{
136 		if (opt_verbose)
137 			Log("%u candidate parents, done.\n", ParentCount);
138 		return false;
139 		}
140 
141 	if (opt_fastalign)
142 		HF.SetA(QSD);
143 	HSPFinder *ptrHF = (opt_fastalign ? &HF : 0);
144 
145 	unsigned ChunkLength;
146 	vector<unsigned> ChunkLos;
147 	GetChunkInfo(QSD.L, ChunkLength, ChunkLos);
148 	const unsigned ChunkCount = SIZE(ChunkLos);
149 
150 	vector<unsigned> ChunkIndexToBestId(ChunkCount, 0);
151 	vector<unsigned> ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX);
152 
153 	vector<SeqData> PSDs;
154 	vector<string> Paths;
155 	double TopPctId = 0.0;
156 	unsigned TopParentIndex = UINT_MAX;
157 	unsigned QL = QSD.L;
158 	vector<unsigned> MaxIdVec(QL, 0);
159 	for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)
160 		{
161 		unsigned ParentSeqIndex = Parents[ParentIndex];
162 
163 		SeqData PSD;
164 		//PSD.Label = U.GetSeedLabel(ParentSeqIndex);
165 		//PSD.Seq = U.GetSeedSeq(ParentSeqIndex);
166 		//PSD.L = U.GetSeedLength(ParentSeqIndex);
167 		//PSD.Index = ParentSeqIndex;
168 		U.GetSeqData(ParentSeqIndex, PSD);
169 		PSDs.push_back(PSD);
170 
171 		if (opt_fastalign)
172 			HF.SetB(PSD);
173 
174 		PathData PD;
175 
176 		float HSPId;
177 		bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD);
178 		if (!Found)
179 			{
180 			Paths.push_back("");
181 			continue;
182 			}
183 
184 		double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true);
185 		if (opt_selfid && PctId == 100.0)
186 			{
187 			Paths.push_back("");
188 			continue;
189 			}
190 
191 		if (PctId > TopPctId)
192 			{
193 			TopParentIndex = ParentIndex;
194 			TopPctId = PctId;
195 			if (TopPctId >= 100.0 - opt_mindiv)
196 				{
197 				if (opt_verbose)
198 					{
199 					Log("  %.1f%%  >%s\n", TopPctId, PSD.Label);
200 					Log("  Top hit exceeds ctl threshold, done.\n");
201 					return false;
202 					}
203 				}
204 			}
205 
206 		string Path = PD.Start;
207 		Paths.push_back(Path);
208 
209 		vector<unsigned> IdVec;
210 		GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);
211 
212 		for (unsigned QPos = 0; QPos < QL; ++QPos)
213 			if (IdVec[QPos] > MaxIdVec[QPos])
214 				MaxIdVec[QPos] = IdVec[QPos];
215 		}
216 
217 	vector<unsigned> BestParents;
218 	for (unsigned k = 0; k < opt_maxp; ++k)
219 		{
220 		unsigned BestParent = UINT_MAX;
221 		unsigned BestCov = 0;
222 		for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex)
223 			{
224 			const SeqData &PSD = PSDs[ParentIndex];
225 			const string &Path = Paths[ParentIndex];
226 			if (Path == "")
227 				continue;
228 
229 			vector<unsigned> IdVec;
230 			GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);
231 
232 			unsigned Cov = 0;
233 			for (unsigned QPos = 0; QPos < QL; ++QPos)
234 				if (IdVec[QPos] == MaxIdVec[QPos])
235 					++Cov;
236 
237 			if (Cov > BestCov)
238 				{
239 				BestParent = ParentIndex;
240 				BestCov = Cov;
241 				}
242 			}
243 
244 		if (BestParent == UINT_MAX)
245 			break;
246 
247 		BestParents.push_back(BestParent);
248 		vector<unsigned> IdVec;
249 
250 		const SeqData &PSD = PSDs[BestParent];
251 		const string &Path = Paths[BestParent];
252 		GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow);
253 		for (unsigned QPos = 0; QPos < QL; ++QPos)
254 			if (IdVec[QPos] == MaxIdVec[QPos])
255 				MaxIdVec[QPos] = UINT_MAX;
256 		}
257 
258 	unsigned BestParentCount = SIZE(BestParents);
259 
260 	if (opt_verbose)
261 		{
262 		Log("%u/%u best parents\n", BestParentCount, ParentCount);
263 		for (unsigned k = 0; k < BestParentCount; ++k)
264 			{
265 			unsigned i = BestParents[k];
266 			Log(" %s\n", PSDs[i].Label);
267 			}
268 		}
269 
270 	bool Found = false;
271 	for (unsigned k1 = 0; k1 < BestParentCount; ++k1)
272 		{
273 		unsigned i1 = BestParents[k1];
274 		asserta(i1 < ParentCount);
275 
276 		const SeqData &PSD1 = PSDs[i1];
277 		const string &Path1 = Paths[i1];
278 
279 		for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2)
280 			{
281 			unsigned i2 = BestParents[k2];
282 			asserta(i2 < ParentCount);
283 			asserta(i2 != i1);
284 
285 			const SeqData &PSD2 = PSDs[i2];
286 			const string &Path2 = Paths[i2];
287 
288 			ChimeHit2 Hit2;
289 			AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2);
290 			Hit2.PctIdQT = TopPctId;
291 
292 			if (Hit2.Accept())
293 				Found = true;
294 
295 			if (Hit2.Score > Hit.Score)
296 				Hit = Hit2;
297 
298 			if (opt_verbose)
299 				Hit2.LogMe();
300 			}
301 		}
302 
303 	return Found;
304 	}
305