1 #include <sstream>
2 #include <set>
3
4 /** -- fusion functions -- **/
5
6 /**
7 FUSION output is written to OUT_DIR/fusion.txt where 'OUT_DIR' is the specified
8 output directory for kallisto. fusion.txt is a tab separated text file with 10 columns
9 TYPE NAME1 SEQ1 KPOS1 NAME2 SEQ2 KPOS2 INFO POS1 POS2
10
11 TYPE can be one of SPLIT or PAIR, NAME1 and NAME2 are the sequence identifiers.
12 SEQ1 and SEQ2 are the raw read sequences identifie as overlapping the potential fusion.
13 KPOS1 and KPOS2 are the positions within the reads where the k-mers match.
14
15 PAIRs are readpairs where fragment doesn't map, but each read
16 maps independently, potentially the fusion breakpoint is within the fragment, but not sequenced.
17
18 SPLITs are reads where the fusion breakpoint appears within one of the reads
19
20 INFO is NA for PAIR but of the form splitat=(a,b)
21 where a=0,1 indicating whether the first or the second pair is split and b is the 0-based position of the
22 split within the read.
23
24 POS1 and POS2 are comma separated lists of locations of SEQ1 and SEQ2 respectively. Each position is of
25 the form (tx_name,pos,strand) where pos is 0-based from the start of the transcript tx_name and strand
26 is either FW or RE depending on whether the read aligns to the forward or reverse of the transcript.
27 **/
28
printTranscripts(const KmerIndex & index,std::stringstream & o,const std::string s,const std::vector<std::pair<KmerEntry,int>> & v,const std::vector<int> u)29 void printTranscripts(const KmerIndex& index, std::stringstream& o, const std::string s,
30 const std::vector<std::pair<KmerEntry,int>>& v, const std::vector<int> u) {
31
32 Kmer km;
33 KmerEntry val;
34 int p;
35
36 // find first mapping k-mer
37 if (!v.empty()) {
38 p = findFirstMappingKmer(v,val);
39 km = Kmer((s.c_str()+p));
40 }
41
42
43 for (int i = 0; i < u.size(); i++) {
44 int tr = u[i];
45 if (i > 0) {
46 o << ";";
47 }
48 std::pair<int, bool> xp = index.findPosition(tr, km, val, p);
49 o << "(" << index.target_names_[tr] << "," << xp.first << ",";
50 if (xp.second) {
51 o << "FW)";
52 } else {
53 o << "RC)";
54 }
55 }
56 }
57
simpleIntersect(const KmerIndex & index,const std::vector<std::pair<KmerEntry,int>> & v)58 std::vector<int> simpleIntersect(const KmerIndex& index, const std::vector<std::pair<KmerEntry,int>>& v) {
59 if (v.empty()) {
60 return {};
61 }
62 int ec = index.dbGraph.ecs[v[0].first.contig];
63 int lastEC = ec;
64 std::vector<int> u = index.ecmap[ec];
65
66 for (int i = 1; i < v.size(); i++) {
67 if (v[i].first.contig != v[i-1].first.contig) {
68 ec = index.dbGraph.ecs[v[i].first.contig];
69 if (ec != lastEC) {
70 u = index.intersect(ec, u);
71 lastEC = ec;
72 if (u.empty()) {
73 return u;
74 }
75 }
76 }
77 }
78 return u;
79 }
80
81
checkMapability(const KmerIndex & index,const std::string & s,const std::vector<std::pair<KmerEntry,int>> & v,std::vector<int> & u)82 bool checkMapability(const KmerIndex& index, const std::string &s, const std::vector<std::pair<KmerEntry,int>>& v, std::vector<int> &u) {
83 const int maxMismatch = 2;
84 const int maxSoftclip = 5;
85
86 Kmer km;
87 KmerEntry val;
88 int p;
89
90 if (!v.empty()) {
91 p = findFirstMappingKmer(v,val);
92 km = Kmer(s.c_str()+p);
93 } else {
94 return false;
95 }
96
97 std::vector<int> vtmp; vtmp.reserve(u.size());
98
99 for (auto tr : u) {
100 auto trpos = index.findPosition(tr, km, val, p);
101 int tpos = (int)trpos.first;
102 int sz = (int)s.size();
103 bool add = true;
104 if (trpos.second) {
105 if (tpos < 1 || tpos + sz - 1 > index.target_seqs_[tr].size()) {
106 add = false;
107 } else {
108 //std::cout << index.target_seqs_[tr].substr(tpos,sz) << std::endl;
109 //std::cout << s << std::endl;
110 int mis = 0;
111 for (int i = 0; i < sz - maxSoftclip; i++) {
112 if (index.target_seqs_[tr][tpos-1 + i] != s[i]) {
113 ++mis;
114 if (mis > maxMismatch) {
115 break;
116 }
117 }
118 }
119 add = (mis <= maxMismatch);
120 }
121 } else {
122 if (tpos > index.target_seqs_[tr].size() || tpos - sz < 1) {
123 add = false;
124 } else {
125 std::string rs = revcomp(s);
126 //std::cout << index.target_seqs_[tr].substr(tpos - sz, sz) << std::endl;
127 //std::cout << rs << std::endl;
128 int mis = 0;
129 for (int i = sz-1; i >= maxSoftclip; i--) {
130 if (index.target_seqs_[tr][tpos-sz+i] != rs[sz]) {
131 ++mis;
132 if (mis > maxMismatch) {
133 break;
134 }
135 }
136 }
137 add = (mis <= maxMismatch);
138 }
139 }
140
141 if (add) {
142 vtmp.push_back(tr);
143 }
144
145 }
146
147
148 if (vtmp.empty()) {
149 return false;
150 }
151
152 if (vtmp.size() < u.size()) {
153 u = vtmp; // copy
154 }
155
156 return true;
157
158 }
159
160 // returns true if the intersection of the union of EC classes for v1 and v2 respectively is empty
checkUnionIntersection(const KmerIndex & index,const std::string & s1,const std::string & s2,std::pair<int,int> & p1,std::pair<int,int> & p2)161 bool checkUnionIntersection(const KmerIndex& index, const std::string &s1, const std::string &s2, std::pair<int,int> &p1, std::pair<int,int> &p2) {
162 //const std::vector<std::pair<KmerEntry,int>>& v1, const std::vector<std::pair<KmerEntry,int>>& v2) {
163
164 std::set<int> su1,su2;
165
166 auto union_set = [&](const std::string &s, std::pair<int,int> &p) {
167 p = {-1,-1};
168 std::set<int> su;
169
170 KmerIterator kit(s.c_str()), kit_end;
171 int lastEC = -1;
172 for (int i = 0; kit != kit_end; ++i,++kit) {
173 auto search = index.kmap.find(kit->first.rep());
174 if (search != index.kmap.end()) {
175 if (p.first == -1) {
176 p.first = kit->second;
177 p.second = p.first +1;
178 } else {
179 if (p.second + 1 == kit->second) {
180 p.second++;
181 }
182 }
183 const KmerEntry &val = search->second;
184 int ec = index.dbGraph.ecs[val.contig];
185 if (ec != -1 && ec != lastEC) {
186 su.insert(index.ecmap[ec].begin(), index.ecmap[ec].end());
187 lastEC = ec;
188 }
189 }
190 }
191
192 /*
193 int ec = index.dbGraph.ecs[v[0].first.contig];
194 int lastEC = ec;
195 //const std::vector<int> &u = index.ecmap[ec];
196 s.insert(index.ecmap[ec].begin(), index.ecmap[ec].end());
197 for (int i = 1; i < v.size(); i++) {
198 if (v[i].first.contig != v[i-1].first.contig) {
199 ec = index.dbGraph.ecs[v[i].first.contig];
200 if (ec != lastEC) {
201 s.insert(index.ecmap[ec].begin(), index.ecmap[ec].end());
202 lastEC = ec;
203 }
204 }
205 }
206 */
207 return su;
208 };
209
210 su1 = union_set(s1,p1);
211 su2 = union_set(s2,p2);
212
213 if (su1.empty() || su2.empty()) {
214 return false; // TODO, decide on this
215 }
216
217 if (su2.size() < su1.size()) {
218 swap(su1,su2);
219 }
220 for (auto x : su1) {
221 if (su2.count(x) != 0) {
222 return false;
223 }
224 }
225 return true;
226 }
227
228
searchFusion(const KmerIndex & index,const ProgramOptions & opt,const MinCollector & tc,MasterProcessor & mp,int ec,const std::string & n1,const std::string & s1,std::vector<std::pair<KmerEntry,int>> & v1,const std::string & n2,const std::string & s2,std::vector<std::pair<KmerEntry,int>> & v2,bool paired)229 void searchFusion(const KmerIndex &index, const ProgramOptions& opt,
230 const MinCollector& tc, MasterProcessor& mp, int ec,
231 const std::string &n1, const std::string &s1, std::vector<std::pair<KmerEntry,int>> &v1,
232 const std::string &n2, const std::string &s2, std::vector<std::pair<KmerEntry,int>> &v2, bool paired) {
233
234 bool partialMap = false;
235 if (ec != -1) {
236 partialMap = true;
237 }
238 std::stringstream o;
239
240 // no mapping information
241 if (v1.empty() && v2.empty()) {
242 return; // consider splitting in case either is empty
243 }
244
245
246 auto u1 = simpleIntersect(index, v1);
247 auto u2 = simpleIntersect(index, v2);
248
249 // discordant pairs
250 if (!v1.empty() && !v2.empty()) {
251 if (!u1.empty() && !u2.empty()) {
252 std::pair<int,int> p1,p2;
253 if (checkUnionIntersection(index, s1, s2, p1, p2)) {
254 // each pair maps to either end
255 // each pair maps to either end
256 o << "PAIR\t";
257 o << n1 << "\t";
258 o << s1 << "\t";
259 o << p1.first << "," << p1.second << "\t";
260 if (paired) {
261 o << n2 << "\t";
262 o << s2 << "\t";
263 o << p2.first << "," << p2.second << "\t\tNA\t";
264 } else {
265 o << "\t\t\tNA\t";
266 }
267 printTranscripts(index, o, s1, v1, u1); o << "\t"; printTranscripts(index, o, s2, v2, u2);
268 mp.outputFusion(o);
269 return;
270 }
271 }
272 }
273
274 if ((v1.empty() && !u2.empty()) || (v2.empty() && !u1.empty())) {
275 // read was trimmed
276 partialMap = true;
277 return;
278 }
279
280 // ok so ec == -1 and not both v1 and v2 are empty
281 // exactly one of u1 and u2 are empty
282 std::vector<std::pair<KmerEntry,int>> vsafe, vsplit;
283 // sort v1 and v2 by read position
284 auto vsorter = [&](std::pair<KmerEntry, int> a, std::pair<KmerEntry, int> b) {
285 return a.second < b.second;
286 };
287
288 std::sort(v1.begin(), v1.end(), vsorter);
289 std::sort(v2.begin(), v2.end(), vsorter);
290 bool split1 = true;
291 if (!v1.empty() && !v2.empty()) {
292 if (u1.empty()) {
293 std::copy(v1.begin(), v1.end(), std::back_inserter(vsplit));
294 std::copy(v2.begin(), v2.end(), std::back_inserter(vsafe));
295 } else {
296 std::copy(v2.begin(), v2.end(), std::back_inserter(vsplit));
297 std::copy(v1.begin(), v1.end(), std::back_inserter(vsafe));
298 split1 = false;
299 }
300 } else if (v1.empty()) {
301 if (u2.empty()) {
302 std::copy(v2.begin(), v2.end(), std::back_inserter(vsplit));
303 } else {
304 assert(false);
305 return; // can this happen?
306 }
307 } else if (v2.empty()) {
308 if (u1.empty()) {
309 std::copy(v1.begin(), v1.end(), std::back_inserter(vsplit));
310 } else {
311 assert(false);
312 return;
313 }
314 }
315
316
317 // now we look for a split j s.t. vsplit[0:j] and vsplit[j:] + vsafe both have nonempty ec
318 int j = vsplit.size()-1;
319 while (!vsplit.empty()) {
320 auto x = vsplit.back();
321 vsplit.pop_back();
322 vsafe.push_back(x);
323
324 auto ut1 = simpleIntersect(index,vsplit);
325 auto ut2 = simpleIntersect(index,vsafe);
326
327 if (!ut1.empty() && !ut2.empty()) {
328 std::pair<int,int> p1,p2;
329 std::string st1,st2;
330 if (u1.empty()) {
331 st1 = s1.substr(0,vsafe.back().second);
332 st2 = s2;
333 } else {
334 st1 = s1;
335 st2 = s2.substr(0,vsafe.back().second);
336 }
337 if (checkUnionIntersection(index, st1, st2, p1, p2)) { // need to check this more carefully
338 o << "SPLIT\t";
339 o << n1 << "\t" << s1 << "\t" << p1.first << "," << p1.second << "\t";
340 // what to put as info?
341 if (paired) {
342 o << n2 << "\t" << s2 << "\t" << p2.first << "," << p2.second << "\t";
343 } else {
344 o << "\t\t\t";
345 }
346 o << "splitat=";
347 if (u1.empty()) {
348 o << "(0,";
349 } else {
350 o << "(1,";
351 }
352 o << vsafe.back().second << ")\t";
353 // fix this
354 if (split1) {
355 printTranscripts(index, o, s1, vsplit, ut1); o << "\t"; printTranscripts(index, o, s2, vsafe, ut2);
356 } else {
357 printTranscripts(index, o, s1, vsafe, ut2); o << "\t"; printTranscripts(index, o, s2, vsplit, ut1);
358 }
359 mp.outputFusion(o);
360 return;
361 }
362 } else {
363 j--;
364 }
365 }
366
367 return;
368
369 }
370