1 #include <sstream>
2 #include <set>
3 
4 /** -- fusion functions -- **/
5 
6 /**
7 FUSION output is written to OUT_DIR/fusion.txt where 'OUT_DIR' is the specified
8 output directory for kallisto. fusion.txt is a tab separated text file with 10 columns
9 TYPE  NAME1  SEQ1  KPOS1  NAME2  SEQ2  KPOS2  INFO  POS1  POS2
10 
11 TYPE can be one of SPLIT or PAIR, NAME1 and NAME2 are the sequence identifiers.
12 SEQ1 and SEQ2 are the raw read sequences identifie as overlapping the potential fusion.
13 KPOS1 and KPOS2 are the positions within the reads where the k-mers match.
14 
15 PAIRs are readpairs where fragment doesn't map, but each read
16 maps independently, potentially the fusion breakpoint is within the fragment, but not sequenced.
17 
18 SPLITs are reads where the fusion breakpoint appears within one of the reads
19 
20 INFO is NA for PAIR but of the form splitat=(a,b)
21 where a=0,1 indicating whether the first or the second pair is split and b is the 0-based position of the
22 split within the read.
23 
24 POS1 and POS2 are comma separated lists of locations of SEQ1 and SEQ2 respectively. Each position is of
25 the form (tx_name,pos,strand) where pos is 0-based from the start of the transcript tx_name and strand
26 is either FW or RE depending on whether the read aligns to the forward or reverse of the transcript.
27 **/
28 
printTranscripts(const KmerIndex & index,std::stringstream & o,const std::string s,const std::vector<std::pair<KmerEntry,int>> & v,const std::vector<int> u)29 void printTranscripts(const KmerIndex& index, std::stringstream& o, const std::string s,
30   const std::vector<std::pair<KmerEntry,int>>& v, const std::vector<int> u) {
31 
32   Kmer km;
33   KmerEntry val;
34   int p;
35 
36   // find first mapping k-mer
37   if (!v.empty()) {
38     p = findFirstMappingKmer(v,val);
39     km = Kmer((s.c_str()+p));
40   }
41 
42 
43   for (int i = 0; i < u.size(); i++) {
44     int tr = u[i];
45     if (i > 0) {
46       o << ";";
47     }
48     std::pair<int, bool> xp = index.findPosition(tr, km, val, p);
49     o << "(" << index.target_names_[tr] << "," << xp.first << ",";
50     if (xp.second) {
51       o << "FW)";
52     } else {
53       o << "RC)";
54     }
55   }
56 }
57 
simpleIntersect(const KmerIndex & index,const std::vector<std::pair<KmerEntry,int>> & v)58 std::vector<int> simpleIntersect(const KmerIndex& index, const std::vector<std::pair<KmerEntry,int>>& v) {
59   if (v.empty()) {
60     return {};
61   }
62   int ec = index.dbGraph.ecs[v[0].first.contig];
63   int lastEC = ec;
64   std::vector<int> u = index.ecmap[ec];
65 
66   for (int i = 1; i < v.size(); i++) {
67     if (v[i].first.contig != v[i-1].first.contig) {
68       ec = index.dbGraph.ecs[v[i].first.contig];
69       if (ec != lastEC) {
70         u = index.intersect(ec, u);
71         lastEC = ec;
72         if (u.empty()) {
73           return u;
74         }
75       }
76     }
77   }
78   return u;
79 }
80 
81 
checkMapability(const KmerIndex & index,const std::string & s,const std::vector<std::pair<KmerEntry,int>> & v,std::vector<int> & u)82 bool checkMapability(const KmerIndex& index, const std::string &s, const std::vector<std::pair<KmerEntry,int>>& v, std::vector<int> &u) {
83   const int maxMismatch = 2;
84   const int maxSoftclip = 5;
85 
86   Kmer km;
87   KmerEntry val;
88   int p;
89 
90   if (!v.empty()) {
91     p = findFirstMappingKmer(v,val);
92     km = Kmer(s.c_str()+p);
93   } else {
94     return false;
95   }
96 
97   std::vector<int> vtmp; vtmp.reserve(u.size());
98 
99   for (auto tr : u) {
100     auto trpos = index.findPosition(tr, km, val, p);
101     int tpos = (int)trpos.first;
102     int sz = (int)s.size();
103     bool add = true;
104     if (trpos.second) {
105       if (tpos < 1 || tpos + sz - 1 > index.target_seqs_[tr].size()) {
106         add = false;
107       } else {
108         //std::cout << index.target_seqs_[tr].substr(tpos,sz) << std::endl;
109         //std::cout << s << std::endl;
110         int mis = 0;
111         for (int i = 0; i < sz - maxSoftclip; i++) {
112           if (index.target_seqs_[tr][tpos-1 + i] != s[i]) {
113             ++mis;
114             if (mis > maxMismatch) {
115               break;
116             }
117           }
118         }
119         add = (mis <= maxMismatch);
120       }
121     }  else {
122       if (tpos > index.target_seqs_[tr].size() || tpos - sz < 1) {
123         add = false;
124       } else {
125         std::string rs = revcomp(s);
126         //std::cout << index.target_seqs_[tr].substr(tpos - sz, sz) << std::endl;
127         //std::cout << rs << std::endl;
128         int mis = 0;
129         for (int i = sz-1; i >= maxSoftclip; i--) {
130           if (index.target_seqs_[tr][tpos-sz+i] != rs[sz]) {
131             ++mis;
132             if (mis > maxMismatch) {
133               break;
134             }
135           }
136         }
137         add = (mis <= maxMismatch);
138       }
139     }
140 
141     if (add) {
142       vtmp.push_back(tr);
143     }
144 
145   }
146 
147 
148   if (vtmp.empty()) {
149     return false;
150   }
151 
152   if (vtmp.size() < u.size()) {
153     u = vtmp; // copy
154   }
155 
156   return true;
157 
158 }
159 
160 // returns true if the intersection of the union of EC classes for v1 and v2 respectively is empty
checkUnionIntersection(const KmerIndex & index,const std::string & s1,const std::string & s2,std::pair<int,int> & p1,std::pair<int,int> & p2)161 bool checkUnionIntersection(const KmerIndex& index, const std::string &s1, const std::string &s2, std::pair<int,int> &p1, std::pair<int,int> &p2) {
162 //const std::vector<std::pair<KmerEntry,int>>& v1, const std::vector<std::pair<KmerEntry,int>>& v2) {
163 
164   std::set<int> su1,su2;
165 
166   auto union_set = [&](const std::string &s, std::pair<int,int> &p) {
167     p = {-1,-1};
168     std::set<int> su;
169 
170     KmerIterator kit(s.c_str()), kit_end;
171     int lastEC = -1;
172     for (int i = 0; kit != kit_end; ++i,++kit) {
173       auto search = index.kmap.find(kit->first.rep());
174       if (search != index.kmap.end()) {
175         if (p.first == -1) {
176           p.first = kit->second;
177           p.second = p.first +1;
178         } else {
179           if (p.second + 1 == kit->second) {
180             p.second++;
181           }
182         }
183         const KmerEntry &val = search->second;
184         int ec = index.dbGraph.ecs[val.contig];
185         if (ec != -1 && ec != lastEC) {
186           su.insert(index.ecmap[ec].begin(), index.ecmap[ec].end());
187           lastEC = ec;
188         }
189       }
190     }
191 
192     /*
193     int ec = index.dbGraph.ecs[v[0].first.contig];
194     int lastEC = ec;
195     //const std::vector<int> &u = index.ecmap[ec];
196     s.insert(index.ecmap[ec].begin(), index.ecmap[ec].end());
197     for (int i = 1; i < v.size(); i++) {
198       if (v[i].first.contig != v[i-1].first.contig) {
199         ec = index.dbGraph.ecs[v[i].first.contig];
200         if (ec != lastEC) {
201           s.insert(index.ecmap[ec].begin(), index.ecmap[ec].end());
202           lastEC = ec;
203         }
204       }
205     }
206     */
207     return su;
208   };
209 
210   su1 = union_set(s1,p1);
211   su2 = union_set(s2,p2);
212 
213   if (su1.empty() || su2.empty()) {
214     return false; // TODO, decide on this
215   }
216 
217   if (su2.size() < su1.size()) {
218     swap(su1,su2);
219   }
220   for (auto x : su1) {
221     if (su2.count(x) != 0) {
222       return false;
223     }
224   }
225   return true;
226 }
227 
228 
searchFusion(const KmerIndex & index,const ProgramOptions & opt,const MinCollector & tc,MasterProcessor & mp,int ec,const std::string & n1,const std::string & s1,std::vector<std::pair<KmerEntry,int>> & v1,const std::string & n2,const std::string & s2,std::vector<std::pair<KmerEntry,int>> & v2,bool paired)229 void searchFusion(const KmerIndex &index, const ProgramOptions& opt,
230   const MinCollector& tc, MasterProcessor& mp, int ec,
231   const std::string &n1, const std::string &s1, std::vector<std::pair<KmerEntry,int>> &v1,
232   const std::string &n2, const std::string &s2, std::vector<std::pair<KmerEntry,int>> &v2, bool paired) {
233 
234   bool partialMap = false;
235   if (ec != -1) {
236     partialMap = true;
237   }
238   std::stringstream o;
239 
240   // no mapping information
241   if (v1.empty() && v2.empty()) {
242     return; // consider splitting in case either is empty
243   }
244 
245 
246   auto u1 = simpleIntersect(index, v1);
247   auto u2 = simpleIntersect(index, v2);
248 
249   // discordant pairs
250   if (!v1.empty() && !v2.empty()) {
251     if (!u1.empty() && !u2.empty()) {
252       std::pair<int,int> p1,p2;
253       if (checkUnionIntersection(index, s1, s2, p1, p2)) {
254         // each pair maps to either end
255         // each pair maps to either end
256         o << "PAIR\t";
257         o << n1 << "\t";
258         o << s1 << "\t";
259         o << p1.first << "," << p1.second << "\t";
260         if (paired) {
261           o << n2 << "\t";
262           o << s2 << "\t";
263           o << p2.first << "," << p2.second << "\t\tNA\t";
264         } else {
265           o << "\t\t\tNA\t";
266         }
267         printTranscripts(index, o, s1, v1, u1); o << "\t"; printTranscripts(index, o, s2, v2, u2);
268         mp.outputFusion(o);
269         return;
270       }
271     }
272   }
273 
274   if ((v1.empty() && !u2.empty()) ||  (v2.empty() && !u1.empty())) {
275     // read was trimmed
276     partialMap = true;
277     return;
278   }
279 
280   // ok so ec == -1 and not both v1 and v2 are empty
281   // exactly one of u1 and u2 are empty
282   std::vector<std::pair<KmerEntry,int>> vsafe, vsplit;
283   // sort v1 and v2 by read position
284   auto vsorter =  [&](std::pair<KmerEntry, int> a, std::pair<KmerEntry, int> b) {
285     return a.second < b.second;
286   };
287 
288   std::sort(v1.begin(), v1.end(), vsorter);
289   std::sort(v2.begin(), v2.end(), vsorter);
290   bool split1 = true;
291   if (!v1.empty() && !v2.empty()) {
292     if (u1.empty()) {
293       std::copy(v1.begin(), v1.end(), std::back_inserter(vsplit));
294       std::copy(v2.begin(), v2.end(), std::back_inserter(vsafe));
295     } else {
296       std::copy(v2.begin(), v2.end(), std::back_inserter(vsplit));
297       std::copy(v1.begin(), v1.end(), std::back_inserter(vsafe));
298       split1 = false;
299     }
300   } else if (v1.empty()) {
301     if (u2.empty()) {
302       std::copy(v2.begin(), v2.end(), std::back_inserter(vsplit));
303     } else {
304       assert(false);
305       return; // can this happen?
306     }
307   } else if (v2.empty()) {
308     if (u1.empty()) {
309       std::copy(v1.begin(), v1.end(), std::back_inserter(vsplit));
310     } else {
311       assert(false);
312       return;
313     }
314   }
315 
316 
317   // now we look for a split j s.t. vsplit[0:j] and vsplit[j:] + vsafe both have nonempty ec
318   int j = vsplit.size()-1;
319   while (!vsplit.empty()) {
320     auto x = vsplit.back();
321     vsplit.pop_back();
322     vsafe.push_back(x);
323 
324     auto ut1 = simpleIntersect(index,vsplit);
325     auto ut2 = simpleIntersect(index,vsafe);
326 
327     if (!ut1.empty() && !ut2.empty()) {
328       std::pair<int,int> p1,p2;
329       std::string st1,st2;
330       if (u1.empty()) {
331         st1 = s1.substr(0,vsafe.back().second);
332         st2 = s2;
333       } else {
334         st1 = s1;
335         st2 = s2.substr(0,vsafe.back().second);
336       }
337       if (checkUnionIntersection(index, st1, st2, p1, p2)) { // need to check this more carefully
338         o << "SPLIT\t";
339         o << n1 << "\t" << s1 << "\t" << p1.first << "," << p1.second << "\t";
340         // what to put as info?
341         if (paired) {
342           o << n2 << "\t" << s2 << "\t" << p2.first << "," << p2.second << "\t";
343         } else {
344           o << "\t\t\t";
345         }
346         o << "splitat=";
347         if (u1.empty()) {
348           o << "(0,";
349         } else {
350           o << "(1,";
351         }
352         o << vsafe.back().second << ")\t";
353         // fix this
354         if (split1) {
355           printTranscripts(index, o, s1, vsplit, ut1); o << "\t"; printTranscripts(index, o, s2, vsafe, ut2);
356         } else {
357           printTranscripts(index, o, s1, vsafe, ut2); o << "\t"; printTranscripts(index, o, s2, vsplit, ut1);
358         }
359         mp.outputFusion(o);
360         return;
361       }
362     } else {
363       j--;
364     }
365   }
366 
367   return;
368 
369 }
370