1 #ifndef __MUMMER_POSTNUC_H__
2 #define __MUMMER_POSTNUC_H__
3
4 #include <string>
5 #include <vector>
6 #include <iostream>
7 #include <limits>
8 #include <cstring>
9 #include <memory>
10 #include <iomanip>
11 #include <atomic>
12
13 #include "tigrinc.hh"
14 #include "sw_align.hh"
15
16
17 namespace mummer {
18 namespace postnuc {
19 static const signed char FORWARD_CHAR = 1;
20 static const signed char REVERSE_CHAR = -1;
21
22 //------------------------------------------------------ Type Definitions ----//
23 enum LineType
24 //-- The type of input line from <stdin>
25 {
26 NO_LINE, HEADER_LINE, MATCH_LINE
27 };
28
29
30 struct Match
31 //-- An exact match between two sequences A and B
32 {
33 long int sA, sB, len; // start coordinate in A, in B and the length
34 };
35
36
37 struct Cluster
38 //-- An ordered list of matches between two sequences A and B
39 {
40 bool wasFused; // have the cluster matches been fused yet?
41 signed char dirB; // the query sequence direction
42 // FORWARD_CHAR or REVERSE_CHAR
43 std::vector<Match> matches; // the ordered set of matches in the cluster
44 Cluster() = default;
Clustermummer::postnuc::Cluster45 Cluster(char dir) : wasFused(false), dirB(dir) { }
46 };
47
48 // A FastaRecord must respond to seq(), len() and Id().
49 template<typename FastaRecord>
50 struct Synteny
51 //-- An ordered list of clusters between two sequences A and B (B is fixed)
52 {
53 const FastaRecord* AfP; // a pointer to the reference sequence record
54 std::vector<Cluster> clusters; // the ordered set of clusters between A and B
55 Synteny() = default;
Syntenymummer::postnuc::Synteny56 Synteny(const FastaRecord* Af) : AfP(Af) { }
operator <mummer::postnuc::Synteny57 bool operator<(const Synteny& rhs) const { return *AfP < *rhs.AfP; }
58 };
59
60 struct Alignment
61 //-- An alignment object between two sequences A and B
62 {
63 signed char dirB; // the query sequence direction
64 long int sA, sB, eA, eB; // the start in A, B and the end in A, B
65 std::vector<long int> delta; // the delta values, with NO zero at the end
66 long int deltaApos; // sum of abs(deltas) - #of negative deltas
67 // trust me, it is a very helpful value
68 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
69
Alignmentmummer::postnuc::Alignment70 Alignment(const Match& m, const signed char dir)
71 : dirB(dir)
72 , sA(m.sA)
73 , sB(m.sB)
74 , eA(m.sA + m.len - 1)
75 , eB(m.sB + m.len - 1)
76 , deltaApos(0)
77 , Errors(0)
78 , SimErrors(0)
79 , NonAlphas(0)
80 { }
81 Alignment() = default;
82 Alignment(const Alignment& rhs) = default;
83 Alignment(Alignment&& rhs) = default;
84 Alignment& operator=(const Alignment& rhs) = default;
85
86 // Number of bases in alignment (in reference), counting deletions.
totalmummer::postnuc::Alignment87 long total() const {
88 return std::abs(eA - sA) + 1 + std::count_if(delta.cbegin(), delta.cend(), [](long x) { return x < 0; });
89 }
90
identitymummer::postnuc::Alignment91 inline double identity(const long t) const { return (double)(t - Errors) / t; }
identitymummer::postnuc::Alignment92 inline double identity() const { return identity(total()); }
similaritymummer::postnuc::Alignment93 inline double similarity(const long t) const { return (double)(t - SimErrors) / t; }
similaritymummer::postnuc::Alignment94 inline double similarity() const { return similarity(total()); }
stopitymummer::postnuc::Alignment95 inline double stopity(const long t) const { return (double)NonAlphas / (2 * t); }
stopitymummer::postnuc::Alignment96 inline double stopity() const { return stopity(total()); }
97
98 struct stats_type {
99 double identity;
100 double similarity;
101 double stopity;
102 };
statsmummer::postnuc::Alignment103 stats_type stats() const {
104 const long t = total();
105 return { identity(t), similarity(t), stopity(t) };
106 }
107 };
108
109 // Iterator over the error
110
111 // The type of error. INSERTION and DELETION apply to the
112 // reference. INSERTION means an there is an extra base in the
113 // reference., DELETION means there is one less base in the reference.
114 enum error_type { NONE, INSERTION, DELETION, MISMATCH };
115 struct error_description_type {
116 error_type type; // The type of the error
117 const char* ref; // Pointer in the reference of the error
118 const char* qry; // Idem qry
119 long dst; // Distance to previous indel or start of sequences
120
121 // For a mismatch, *ref != *qry. For an INSERTION, qry is the
122 // position in the qry sequence where to insert the extra base,
123 // which is *ref. Idem for DELETION.
124 //
125 // dst is the distance to the previous indel. So dst >= 1 and dst-1
126 // is equal to the number of matching and mismatching bases since the start or the
127 // previous indel.
128 };
129 class error_iterator_type : public std::iterator<std::input_iterator_tag, error_description_type> {
130 const Alignment& m_al;
131 error_description_type m_error;
132 const char* m_ref_end;
133 size_t m_k; // index in delta
134 public:
135 // Create an iterator at beginning of error. ref and qry pointer
136 // points char before the start of sequence (i.e. as in 1-base
137 // indexing).
error_iterator_type(const Alignment & al,const char * ref,const char * qry,size_t qry_len)138 error_iterator_type(const Alignment& al, const char* ref, const char* qry, size_t qry_len)
139 : m_al(al)
140 , m_error{ NONE, ref + al.sA - 1, qry + (al.dirB == 1 ? al.sB - 1 : (long)qry_len - al.sB + 2), 0 }
141 , m_ref_end(ref + al.eA + 1)
142 , m_k(0)
143 { ++*this; }
144 // Create an iterator at end
error_iterator_type(const Alignment & al,const char * ref)145 error_iterator_type(const Alignment& al, const char* ref)
146 : m_al(al)
147 , m_error{NONE, ref + al.eA + 1, nullptr, 0}
148 , m_ref_end(nullptr)
149 , m_k(0)
150 { }
comp(char b)151 static char comp(char b) {
152 switch(b) {
153 case 'a': return 't'; case 'A': return 'T';
154 case 'c': return 'g'; case 'C': return 'G';
155 case 'g': return 'c'; case 'G': return 'C';
156 case 't': return 'a'; case 'T': return 'A';
157 default: return 'n';
158 }
159 }
160
operator ==(const error_iterator_type & rhs) const161 bool operator==(const error_iterator_type& rhs) const { return m_error.ref == rhs.m_error.ref; }
operator !=(const error_iterator_type & rhs) const162 bool operator!=(const error_iterator_type& rhs) const { return m_error.ref != rhs.m_error.ref; }
operator *() const163 const error_description_type& operator*() const { return m_error; }
operator ->() const164 const error_description_type* operator->() const { return &m_error; }
165 error_iterator_type& operator++();
operator ++(int)166 error_iterator_type operator++(int) {
167 error_iterator_type res(*this);
168 ++*this;
169 return res;
170 }
171 };
172
173 std::ostream& operator<<(std::ostream& os, const Alignment& al);
174
175 struct AscendingClusterSort
176 //-- For sorting clusters in ascending order of their sA coordinate
177 {
operator ()mummer::postnuc::AscendingClusterSort178 bool operator() (const Cluster & pA, const Cluster & pB)
179 {
180 return ( pA.matches.front().sA < pB.matches.front().sA );
181 }
182 };
183
184 struct merge_syntenys {
185 const bool DO_DELTA;
186 const bool DO_EXTEND;
187 const bool TO_SEQEND;
188 const bool DO_SHADOWS;
189 const sw_align::aligner_buffer aligner;
190
merge_syntenysmummer::postnuc::merge_syntenys191 merge_syntenys(bool dd, bool de, bool ts, bool ds)
192 : DO_DELTA(dd)
193 , DO_EXTEND(de)
194 , TO_SEQEND(ts)
195 , DO_SHADOWS(ds)
196 , aligner()
197 { }
198
merge_syntenysmummer::postnuc::merge_syntenys199 merge_syntenys(bool dd, bool de, bool ts, bool ds, int break_len, int banding, int matrix_type)
200 : DO_DELTA(dd)
201 , DO_EXTEND(de)
202 , TO_SEQEND(ts)
203 , DO_SHADOWS(ds)
204 , aligner(break_len, banding, matrix_type)
205 { }
206
207 // Process all syntenys in a container
208 template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
209 void processSyntenys_each(Container& Syntenys, const FR2& Bf,
210 ClustersOut clusters, MatchesOut matches) const;
211 template<typename Container, typename FR2, typename MatchesOut>
processSyntenys_eachmummer::postnuc::merge_syntenys212 void processSyntenys_each(Container& Syntenys, const FR2& Bf,
213 MatchesOut matches) const {
214 processSyntenys_each(Syntenys, Bf, [](const Container& s, const FR2& Bf) { },
215 matches);
216 }
217
218 // Process all syntenys in a container in parallel
219 template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
220 void processSyntenys_long_each(Container& Syntenys, const FR2& Bf,
221 ClustersOut clusters, MatchesOut matches) const;
222 // Process all syntenys in a container in parallel
223 template<typename Container, typename FR2, typename MatchesOut>
processSyntenys_long_eachmummer::postnuc::merge_syntenys224 void processSyntenys_long_each(Container& Syntenys, const FR2& Bf,
225 MatchesOut matches) const {
226 processSyntenys_long_each(Syntenys, Bf, [](const Container& s, const FR2& Bf) { },
227 matches);
228 }
229
230 bool extendBackward(std::vector<Alignment> & Alignments, std::vector<Alignment>::iterator CurrAp,
231 std::vector<Alignment>::iterator TargetAp, const char * A, const char * B) const;
232
233 void extendClusters(std::vector<Cluster> & Clusters,
234 const char* Aseq, const long Alen, const char* Bseq, const long Blen,
235 std::vector<Alignment>& Alignments) const;
236
extendClustersmummer::postnuc::merge_syntenys237 std::vector<Alignment> extendClusters(std::vector<Cluster> & Clusters,
238 const char* Aseq, const long Alen, const char* Bseq, const long Blen) const {
239 std::vector<Alignment> res;
240 extendClusters(Clusters, Aseq, Alen, Bseq, Blen, res);
241 return res;
242 }
243
244 protected:
245 bool extendForward(std::vector<Alignment>::iterator Ap, const char * A, long int targetA,
246 const char * B, long int targetB, unsigned int m_o) const;
247
248 std::vector<Cluster>::iterator getForwardTargetCluster(std::vector<Cluster> & Clusters, std::vector<Cluster>::iterator CurrCp,
249 long int & targetA, long int & targetB) const;
250 std::vector<Alignment>::iterator getReverseTargetAlignment(std::vector<Alignment> & Alignments,
251 std::vector<Alignment>::iterator CurrAp) const;
252 void parseDelta(std::vector<Alignment> & Alignments,
253 const char* Aseq, const char* Bseq, const long Blen) const;
254 };
255
256 //-- Helper functions
ignore_line(std::istream & is)257 inline void ignore_line(std::istream& is) {
258 is.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
259 }
260
261 //------------------------------------------------- Function Declarations ----//
262 bool Read_Sequence(std::istream& is, std::string& T, std::string& name);
263
264 void printDeltaAlignments(const std::vector<Alignment>& Alignments,
265 const std::string& AId, const long Alen,
266 const std::string& BId, const long Blen,
267 std::ostream& DeltaFile, const long minLen = 0);
268
269 template<typename FastaRecord>
printDeltaAlignments(const std::vector<Alignment> & Alignments,const FastaRecord & Af,const FastaRecord & Bf,std::ostream & DeltaFile)270 inline void printDeltaAlignments(const std::vector<Alignment>& Alignments,
271 const FastaRecord& Af, const FastaRecord& Bf,
272 std::ostream& DeltaFile) {
273 printDeltaAlignments(Alignments, Af.Id(), Af.len(), Bf.Id(), Bf.len(), DeltaFile);
274 }
275
276 // Print alignments in SAM format
277 template<typename FR1, typename FR2>
278 void printSAMAlignments(const std::vector<Alignment>& Alignments,
279 const FR1& A, const FR2& B,
280 std::ostream& SAMFile, bool long_format, const long minLen = 0);
281 std::string createCIGAR(const std::vector<long int>& ds, long int start, long int end, long int len, bool hard_clip = false);
282 std::string createMD(const Alignment& al, const char* ref,
283 const char* qry, size_t qry_len);
284
285 template<typename FastaRecord>
286 void printSyntenys(const std::vector<Synteny<FastaRecord> >& Syntenys, const FastaRecord& Bf, std::ostream& ClusterFile);
287
288 bool isShadowedCluster
289 (std::vector<Cluster>::const_iterator CurrCp,
290 const std::vector<Alignment> & Alignments, std::vector<Alignment>::const_iterator Ap);
291
292 void __parseAbort
293 (const char *msg, const char* file, size_t line);
__parseAbort(const std::string & s,const char * file,size_t line)294 inline void __parseAbort(const std::string& s, const char* file, size_t line) {
295 __parseAbort(s.c_str(), file, line);
296 }
297
298 #define parseAbort(msg) __parseAbort(msg, __FILE__, __LINE__);
299
revC(long int Coord,long int Len)300 inline long int revC
301 (long int Coord, long int Len)
302 // Reverse complement the given coordinate for the given length.
303
304 {
305 assert (Len - Coord + 1 > 0);
306 return (Len - Coord + 1);
307 }
308
309
310
311 //
312 // Implementation of templated methods
313 //
314 template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
processSyntenys_each(Container & Syntenys,const FR2 & Bf,ClustersOut clusters,MatchesOut matches) const315 void merge_syntenys::processSyntenys_each(Container& Syntenys, const FR2& Bf,
316 ClustersOut clusters, MatchesOut matches) const
317
318 // For each syntenic region with clusters, extend the clusters to
319 // expand total alignment coverage. Only should be called once all
320 // the clusters for the contained syntenic regions have been stored
321 // in the data structure. Frees the memory used by the the syntenic
322 // regions once the output of extendClusters and flushSyntenys has
323 // been produced.
324
325 {
326 //-- For all the contained syntenys
327 for(auto& CurrSp : Syntenys) {
328 //-- If no clusters, ignore
329 if(CurrSp.clusters.empty()) continue;
330 //-- Extend clusters and create the alignment information
331 // alignments.clear();
332 std::vector<Alignment> alignments;
333 extendClusters (CurrSp.clusters, CurrSp.AfP->seq(), CurrSp.AfP->len(), Bf.seq(), Bf.len(), alignments);
334 //-- Output the alignment data to the delta file
335 matches(std::move(alignments), *CurrSp.AfP, Bf);
336 }
337
338 //-- Create the cluster information
339 clusters(Syntenys, Bf);
340 Syntenys.clear();
341 }
342
343 template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
processSyntenys_long_each(Container & Syntenys,const FR2 & Bf,ClustersOut clusters,MatchesOut matches) const344 void merge_syntenys::processSyntenys_long_each(Container& Syntenys, const FR2& Bf,
345 ClustersOut clusters, MatchesOut matches) const
346
347 // For each syntenic region with clusters, extend the clusters to
348 // expand total alignment coverage. Only should be called once all
349 // the clusters for the contained syntenic regions have been stored
350 // in the data structure. Frees the memory used by the the syntenic
351 // regions once the output of extendClusters and flushSyntenys has
352 // been produced.
353
354 {
355 std::atomic<size_t> pos(0);
356 const auto end = Syntenys.end();
357
358 //#pragma omp parallel
359 {
360 auto CurrSp = Syntenys.begin();
361 size_t prev = 0;
362
363 while(true) {
364 const size_t npos = pos++;
365 for( ; prev < npos && CurrSp != end; ++prev, ++CurrSp) ; // Advance
366 if(CurrSp == end) break;
367 if(CurrSp->clusters.empty()) continue;
368 //-- Extend clusters and create the alignment information
369 // alignments.clear();
370 std::vector<Alignment> alignments;
371 extendClusters (CurrSp->clusters, CurrSp->AfP->seq(), CurrSp->AfP->len(), Bf.seq(), Bf.len(), alignments);
372 //-- Output the alignment data to the delta file
373 matches(std::move(alignments), *CurrSp->AfP, Bf);
374 }
375 }
376
377 //-- Create the cluster information
378 clusters(Syntenys, Bf);
379 Syntenys.clear();
380 }
381
382 template<typename FastaRecord>
printSyntenys(const std::vector<Synteny<FastaRecord>> & Syntenys,const FastaRecord & Bf,std::ostream & ClusterFile)383 void printSyntenys(const std::vector<Synteny<FastaRecord> > & Syntenys, const FastaRecord& Bf, std::ostream& ClusterFile)
384
385 // Simply output the synteny/cluster information generated by the mgaps
386 // program. However, now the coordinates reference their appropriate
387 // reference sequence, and the reference sequecne header is added to
388 // the appropriate lines. Free the memory used by Syntenys once the
389 // data is successfully output to the file.
390
391 {
392 if ( ClusterFile ) {
393 for(const auto& Sp : Syntenys) { // each syntenys
394 ClusterFile << '>' << Sp.AfP->Id() << ' ' << Bf.Id() << ' '
395 << Sp.AfP->len() << ' ' << Bf.len() << '\n';
396
397 for (const auto& Cp : Sp.clusters) { // each clusters
398 ClusterFile << std::setw(2) << FORWARD_CHAR << ' ' << std::setw(2) << Cp.dirB << '\n';
399
400 for (auto Mp = Cp.matches.cbegin( ); Mp != Cp.matches.cend( ); ++Mp ) { // each match
401 ClusterFile << std::setw(8) << Mp->sA << ' '
402 << std::setw(8) << (Cp.dirB == FORWARD_CHAR ? Mp->sB : revC(Mp->sB, Bf.len())) << ' '
403 << std::setw(6) << Mp->len;
404 if ( Mp != Cp.matches.cbegin( ) )
405 ClusterFile << std::setw(6) << (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) << ' '
406 << std::setw(6) << (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) << '\n';
407 else
408 ClusterFile << " - -\n";
409 }
410 }
411 }
412 }
413 }
414
415 template<typename FR1, typename FR2>
printSAMAlignments(const std::vector<Alignment> & Alignments,const FR1 & A,const FR2 & B,std::ostream & SAMFile,bool long_format,const long minLen)416 void printSAMAlignments(const std::vector<Alignment>& Alignments,
417 const FR1& A, const FR2& B,
418 std::ostream& SAMFile, bool long_format,
419 const long minLen) {
420 const char* mapq = Alignments.size() > 1 ? "\t10\t" : "\t30\t";
421 bool hard_clip = false;
422 for(const auto& Al : Alignments) {
423 if(std::abs(Al.eA - Al.sA) < minLen && std::abs(Al.eB - Al.sB) < minLen)
424 continue;
425 const bool fwd = Al.dirB == FORWARD_CHAR;
426 SAMFile << B.Id() << '\t'
427 << ((hard_clip ? 0x800 : 0) | (fwd ? 0 : 0x10)) << '\t'
428 << A.Id() << '\t' << Al.sA
429 << mapq
430 << createCIGAR(Al.delta, Al.sB, Al.eB, B.len(), hard_clip)
431 << "\t*\t0\t0\t";
432 if(long_format) {
433 const auto start = hard_clip ? Al.sB : 1;
434 const auto len = hard_clip ? Al.eB - start + 1 : B.len();
435 SAMFile.write(B.seq() + start, len);
436 } else {
437 SAMFile << '*';
438 }
439 SAMFile << "\t*\tNM:i:" << Al.Errors;
440 if(long_format)
441 SAMFile << "\tMD:Z:" << createMD(Al, A.seq(), B.seq(), B.len());
442 SAMFile << '\n';
443 hard_clip = true;
444 }
445 }
446
447
448 } // namespace postnuc
449 } // namespace mummer
450
451 #endif /* __MUMMER_POSTNUC_H__ */
452