1 #ifndef __MUMMER_POSTNUC_H__
2 #define __MUMMER_POSTNUC_H__
3 
4 #include <string>
5 #include <vector>
6 #include <iostream>
7 #include <limits>
8 #include <cstring>
9 #include <memory>
10 #include <iomanip>
11 #include <atomic>
12 
13 #include "tigrinc.hh"
14 #include "sw_align.hh"
15 
16 
17 namespace mummer {
18 namespace postnuc {
19 static const signed char FORWARD_CHAR = 1;
20 static const signed char REVERSE_CHAR = -1;
21 
22 //------------------------------------------------------ Type Definitions ----//
23 enum LineType
24 //-- The type of input line from <stdin>
25   {
26     NO_LINE, HEADER_LINE, MATCH_LINE
27   };
28 
29 
30 struct Match
31 //-- An exact match between two sequences A and B
32 {
33   long int sA, sB, len;      // start coordinate in A, in B and the length
34 };
35 
36 
37 struct Cluster
38 //-- An ordered list of matches between two sequences A and B
39 {
40   bool          wasFused;       // have the cluster matches been fused yet?
41   signed char   dirB;           // the query sequence direction
42                                 //      FORWARD_CHAR or REVERSE_CHAR
43   std::vector<Match> matches;        // the ordered set of matches in the cluster
44   Cluster() = default;
Clustermummer::postnuc::Cluster45   Cluster(char dir) : wasFused(false), dirB(dir) { }
46 };
47 
48 // A FastaRecord must respond to seq(), len() and Id().
49 template<typename FastaRecord>
50 struct Synteny
51 //-- An ordered list of clusters between two sequences A and B (B is fixed)
52 {
53   const FastaRecord*   AfP;     // a pointer to the reference sequence record
54   std::vector<Cluster> clusters; // the ordered set of clusters between A and B
55   Synteny() = default;
Syntenymummer::postnuc::Synteny56   Synteny(const FastaRecord* Af) : AfP(Af) { }
operator <mummer::postnuc::Synteny57   bool operator<(const Synteny& rhs) const { return *AfP < *rhs.AfP; }
58 };
59 
60 struct Alignment
61 //-- An alignment object between two sequences A and B
62 {
63   signed char      dirB;        // the query sequence direction
64   long int         sA, sB, eA, eB; // the start in A, B and the end in A, B
65   std::vector<long int> delta;       // the delta values, with NO zero at the end
66   long int         deltaApos;   // sum of abs(deltas) - #of negative deltas
67                                 //      trust me, it is a very helpful value
68   long int         Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
69 
Alignmentmummer::postnuc::Alignment70   Alignment(const Match& m, const signed char dir)
71     : dirB(dir)
72     , sA(m.sA)
73     , sB(m.sB)
74     , eA(m.sA + m.len - 1)
75     , eB(m.sB + m.len - 1)
76     , deltaApos(0)
77     , Errors(0)
78     , SimErrors(0)
79     , NonAlphas(0)
80   { }
81   Alignment() = default;
82   Alignment(const Alignment& rhs) = default;
83   Alignment(Alignment&& rhs) = default;
84   Alignment& operator=(const Alignment& rhs) = default;
85 
86   // Number of bases in alignment (in reference), counting deletions.
totalmummer::postnuc::Alignment87   long total() const {
88     return std::abs(eA - sA) + 1 + std::count_if(delta.cbegin(), delta.cend(), [](long x) { return x < 0; });
89   }
90 
identitymummer::postnuc::Alignment91   inline double identity(const long t) const { return (double)(t - Errors) / t; }
identitymummer::postnuc::Alignment92   inline double identity() const { return identity(total()); }
similaritymummer::postnuc::Alignment93   inline double similarity(const long t) const { return (double)(t - SimErrors) / t; }
similaritymummer::postnuc::Alignment94   inline double similarity() const { return similarity(total()); }
stopitymummer::postnuc::Alignment95   inline double stopity(const long t) const { return (double)NonAlphas / (2 * t); }
stopitymummer::postnuc::Alignment96   inline double stopity() const { return stopity(total()); }
97 
98   struct stats_type {
99     double identity;
100     double similarity;
101     double stopity;
102   };
statsmummer::postnuc::Alignment103   stats_type stats() const {
104     const long t = total();
105     return { identity(t), similarity(t), stopity(t) };
106   }
107 };
108 
109 // Iterator over the error
110 
111 // The type of error. INSERTION and DELETION apply to the
112 // reference. INSERTION means an there is an extra base in the
113 // reference., DELETION means there is one less base in the reference.
114 enum error_type { NONE, INSERTION, DELETION, MISMATCH };
115 struct error_description_type {
116   error_type  type;             // The type of the error
117   const char* ref;              // Pointer in the reference of the error
118   const char* qry;              // Idem qry
119   long        dst;              // Distance to previous indel or start of sequences
120 
121   // For a mismatch, *ref != *qry. For an INSERTION, qry is the
122   // position in the qry sequence where to insert the extra base,
123   // which is *ref. Idem for DELETION.
124   //
125   // dst is the distance to the previous indel. So dst >= 1 and dst-1
126   // is equal to the number of matching and mismatching bases since the start or the
127   // previous indel.
128 };
129 class error_iterator_type : public std::iterator<std::input_iterator_tag, error_description_type> {
130   const Alignment&       m_al;
131   error_description_type m_error;
132   const char*            m_ref_end;
133   size_t                 m_k;   // index in delta
134 public:
135   // Create an iterator at beginning of error. ref and qry pointer
136   // points char before the start of sequence (i.e. as in 1-base
137   // indexing).
error_iterator_type(const Alignment & al,const char * ref,const char * qry,size_t qry_len)138   error_iterator_type(const Alignment& al, const char* ref, const char* qry, size_t qry_len)
139     : m_al(al)
140     , m_error{ NONE, ref + al.sA - 1, qry + (al.dirB == 1 ? al.sB - 1 : (long)qry_len - al.sB + 2), 0 }
141     , m_ref_end(ref + al.eA + 1)
142     , m_k(0)
143   { ++*this; }
144   // Create an iterator at end
error_iterator_type(const Alignment & al,const char * ref)145   error_iterator_type(const Alignment& al, const char* ref)
146     : m_al(al)
147     , m_error{NONE, ref + al.eA + 1, nullptr, 0}
148     , m_ref_end(nullptr)
149     , m_k(0)
150   { }
comp(char b)151   static char comp(char b) {
152     switch(b) {
153     case 'a': return 't'; case 'A': return 'T';
154     case 'c': return 'g'; case 'C': return 'G';
155     case 'g': return 'c'; case 'G': return 'C';
156     case 't': return 'a'; case 'T': return 'A';
157     default: return 'n';
158     }
159   }
160 
operator ==(const error_iterator_type & rhs) const161   bool operator==(const error_iterator_type& rhs) const { return m_error.ref == rhs.m_error.ref; }
operator !=(const error_iterator_type & rhs) const162   bool operator!=(const error_iterator_type& rhs) const { return m_error.ref != rhs.m_error.ref; }
operator *() const163   const error_description_type& operator*() const { return m_error; }
operator ->() const164   const error_description_type* operator->() const { return &m_error; }
165   error_iterator_type& operator++();
operator ++(int)166   error_iterator_type operator++(int) {
167     error_iterator_type res(*this);
168     ++*this;
169     return res;
170   }
171 };
172 
173 std::ostream& operator<<(std::ostream& os, const Alignment& al);
174 
175 struct AscendingClusterSort
176 //-- For sorting clusters in ascending order of their sA coordinate
177 {
operator ()mummer::postnuc::AscendingClusterSort178   bool operator() (const Cluster & pA, const Cluster & pB)
179   {
180     return ( pA.matches.front().sA < pB.matches.front().sA );
181   }
182 };
183 
184 struct merge_syntenys {
185   const bool                     DO_DELTA;
186   const bool                     DO_EXTEND;
187   const bool                     TO_SEQEND;
188   const bool                     DO_SHADOWS;
189   const sw_align::aligner_buffer aligner;
190 
merge_syntenysmummer::postnuc::merge_syntenys191   merge_syntenys(bool dd, bool de, bool ts, bool ds)
192     : DO_DELTA(dd)
193     , DO_EXTEND(de)
194     , TO_SEQEND(ts)
195     , DO_SHADOWS(ds)
196     , aligner()
197   { }
198 
merge_syntenysmummer::postnuc::merge_syntenys199   merge_syntenys(bool dd, bool de, bool ts, bool ds, int break_len, int banding, int matrix_type)
200     : DO_DELTA(dd)
201     , DO_EXTEND(de)
202     , TO_SEQEND(ts)
203     , DO_SHADOWS(ds)
204     , aligner(break_len, banding, matrix_type)
205   { }
206 
207   // Process all syntenys in a container
208   template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
209   void processSyntenys_each(Container& Syntenys, const FR2& Bf,
210                             ClustersOut clusters, MatchesOut matches) const;
211   template<typename Container, typename FR2, typename MatchesOut>
processSyntenys_eachmummer::postnuc::merge_syntenys212   void processSyntenys_each(Container& Syntenys, const FR2& Bf,
213                             MatchesOut matches) const {
214     processSyntenys_each(Syntenys, Bf, [](const Container& s, const FR2& Bf) { },
215                          matches);
216   }
217 
218   // Process all syntenys in a container in parallel
219   template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
220   void processSyntenys_long_each(Container& Syntenys, const FR2& Bf,
221                                  ClustersOut clusters, MatchesOut matches) const;
222   // Process all syntenys in a container in parallel
223   template<typename Container, typename FR2, typename MatchesOut>
processSyntenys_long_eachmummer::postnuc::merge_syntenys224   void processSyntenys_long_each(Container& Syntenys, const FR2& Bf,
225                                  MatchesOut matches) const {
226     processSyntenys_long_each(Syntenys, Bf, [](const Container& s, const FR2& Bf) { },
227                               matches);
228   }
229 
230   bool extendBackward(std::vector<Alignment> & Alignments, std::vector<Alignment>::iterator CurrAp,
231                       std::vector<Alignment>::iterator TargetAp, const char * A, const char * B) const;
232 
233   void extendClusters(std::vector<Cluster> & Clusters,
234                       const char* Aseq, const long Alen, const char* Bseq, const long Blen,
235                       std::vector<Alignment>& Alignments) const;
236 
extendClustersmummer::postnuc::merge_syntenys237   std::vector<Alignment> extendClusters(std::vector<Cluster> & Clusters,
238                                         const char* Aseq, const long Alen, const char* Bseq, const long Blen) const {
239     std::vector<Alignment> res;
240     extendClusters(Clusters, Aseq, Alen, Bseq, Blen, res);
241     return res;
242   }
243 
244 protected:
245   bool extendForward(std::vector<Alignment>::iterator Ap, const char * A, long int targetA,
246                      const char * B, long int targetB, unsigned int m_o) const;
247 
248   std::vector<Cluster>::iterator getForwardTargetCluster(std::vector<Cluster> & Clusters, std::vector<Cluster>::iterator CurrCp,
249                                                          long int & targetA, long int & targetB) const;
250   std::vector<Alignment>::iterator getReverseTargetAlignment(std::vector<Alignment> & Alignments,
251                                                              std::vector<Alignment>::iterator CurrAp) const;
252   void parseDelta(std::vector<Alignment> & Alignments,
253                   const char* Aseq, const char* Bseq, const long Blen) const;
254 };
255 
256 //-- Helper functions
ignore_line(std::istream & is)257 inline void ignore_line(std::istream& is) {
258   is.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
259 }
260 
261 //------------------------------------------------- Function Declarations ----//
262 bool Read_Sequence(std::istream& is, std::string& T, std::string& name);
263 
264 void printDeltaAlignments(const std::vector<Alignment>& Alignments,
265                           const std::string& AId, const long Alen,
266                           const std::string& BId, const long Blen,
267                           std::ostream& DeltaFile, const long minLen = 0);
268 
269 template<typename FastaRecord>
printDeltaAlignments(const std::vector<Alignment> & Alignments,const FastaRecord & Af,const FastaRecord & Bf,std::ostream & DeltaFile)270 inline void printDeltaAlignments(const std::vector<Alignment>& Alignments,
271                           const FastaRecord& Af, const FastaRecord& Bf,
272                           std::ostream& DeltaFile) {
273   printDeltaAlignments(Alignments, Af.Id(), Af.len(), Bf.Id(), Bf.len(), DeltaFile);
274 }
275 
276 // Print alignments in SAM format
277 template<typename FR1, typename FR2>
278 void printSAMAlignments(const std::vector<Alignment>& Alignments,
279                         const FR1& A, const FR2& B,
280                         std::ostream& SAMFile, bool long_format, const long minLen = 0);
281 std::string createCIGAR(const std::vector<long int>& ds, long int start, long int end, long int len, bool hard_clip = false);
282 std::string createMD(const Alignment& al, const char* ref,
283                      const char* qry, size_t qry_len);
284 
285 template<typename FastaRecord>
286 void printSyntenys(const std::vector<Synteny<FastaRecord> >& Syntenys, const FastaRecord& Bf, std::ostream& ClusterFile);
287 
288 bool isShadowedCluster
289 (std::vector<Cluster>::const_iterator CurrCp,
290  const std::vector<Alignment> & Alignments, std::vector<Alignment>::const_iterator Ap);
291 
292 void __parseAbort
293 (const char *msg, const char* file, size_t line);
__parseAbort(const std::string & s,const char * file,size_t line)294 inline void __parseAbort(const std::string& s, const char* file, size_t line) {
295   __parseAbort(s.c_str(), file, line);
296 }
297 
298 #define parseAbort(msg) __parseAbort(msg, __FILE__, __LINE__);
299 
revC(long int Coord,long int Len)300 inline long int revC
301 (long int Coord, long int Len)
302   //  Reverse complement the given coordinate for the given length.
303 
304 {
305   assert (Len - Coord + 1 > 0);
306   return (Len - Coord + 1);
307 }
308 
309 
310 
311 //
312 // Implementation of templated methods
313 //
314 template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
processSyntenys_each(Container & Syntenys,const FR2 & Bf,ClustersOut clusters,MatchesOut matches) const315 void merge_syntenys::processSyntenys_each(Container& Syntenys, const FR2& Bf,
316                                           ClustersOut clusters, MatchesOut matches) const
317 
318 //  For each syntenic region with clusters, extend the clusters to
319 //  expand total alignment coverage. Only should be called once all
320 //  the clusters for the contained syntenic regions have been stored
321 //  in the data structure. Frees the memory used by the the syntenic
322 //  regions once the output of extendClusters and flushSyntenys has
323 //  been produced.
324 
325 {
326   //-- For all the contained syntenys
327   for(auto& CurrSp : Syntenys) {
328     //-- If no clusters, ignore
329     if(CurrSp.clusters.empty()) continue;
330     //-- Extend clusters and create the alignment information
331     //      alignments.clear();
332     std::vector<Alignment> alignments;
333     extendClusters (CurrSp.clusters, CurrSp.AfP->seq(), CurrSp.AfP->len(), Bf.seq(), Bf.len(), alignments);
334     //-- Output the alignment data to the delta file
335     matches(std::move(alignments), *CurrSp.AfP, Bf);
336   }
337 
338   //-- Create the cluster information
339   clusters(Syntenys, Bf);
340   Syntenys.clear();
341 }
342 
343 template<typename Container, typename FR2, typename ClustersOut, typename MatchesOut>
processSyntenys_long_each(Container & Syntenys,const FR2 & Bf,ClustersOut clusters,MatchesOut matches) const344 void merge_syntenys::processSyntenys_long_each(Container& Syntenys, const FR2& Bf,
345                                                ClustersOut clusters, MatchesOut matches) const
346 
347 //  For each syntenic region with clusters, extend the clusters to
348 //  expand total alignment coverage. Only should be called once all
349 //  the clusters for the contained syntenic regions have been stored
350 //  in the data structure. Frees the memory used by the the syntenic
351 //  regions once the output of extendClusters and flushSyntenys has
352 //  been produced.
353 
354 {
355   std::atomic<size_t> pos(0);
356   const auto end = Syntenys.end();
357 
358 //#pragma omp parallel
359   {
360     auto CurrSp = Syntenys.begin();
361     size_t prev = 0;
362 
363     while(true) {
364       const size_t npos = pos++;
365       for( ; prev < npos && CurrSp != end; ++prev, ++CurrSp) ; // Advance
366       if(CurrSp == end) break;
367       if(CurrSp->clusters.empty()) continue;
368       //-- Extend clusters and create the alignment information
369       //      alignments.clear();
370       std::vector<Alignment> alignments;
371       extendClusters (CurrSp->clusters, CurrSp->AfP->seq(), CurrSp->AfP->len(), Bf.seq(), Bf.len(), alignments);
372       //-- Output the alignment data to the delta file
373       matches(std::move(alignments), *CurrSp->AfP, Bf);
374     }
375   }
376 
377   //-- Create the cluster information
378   clusters(Syntenys, Bf);
379   Syntenys.clear();
380 }
381 
382 template<typename FastaRecord>
printSyntenys(const std::vector<Synteny<FastaRecord>> & Syntenys,const FastaRecord & Bf,std::ostream & ClusterFile)383 void printSyntenys(const std::vector<Synteny<FastaRecord> > & Syntenys, const FastaRecord& Bf, std::ostream& ClusterFile)
384 
385 //  Simply output the synteny/cluster information generated by the mgaps
386 //  program. However, now the coordinates reference their appropriate
387 //  reference sequence, and the reference sequecne header is added to
388 //  the appropriate lines. Free the memory used by Syntenys once the
389 //  data is successfully output to the file.
390 
391 {
392   if ( ClusterFile ) {
393     for(const auto& Sp : Syntenys) { // each syntenys
394       ClusterFile << '>' << Sp.AfP->Id() << ' ' << Bf.Id() << ' '
395                   << Sp.AfP->len() << ' ' << Bf.len() << '\n';
396 
397       for (const auto& Cp : Sp.clusters) { // each clusters
398         ClusterFile << std::setw(2) << FORWARD_CHAR << ' ' << std::setw(2) << Cp.dirB << '\n';
399 
400         for (auto Mp = Cp.matches.cbegin( ); Mp != Cp.matches.cend( ); ++Mp ) { // each match
401             ClusterFile << std::setw(8) << Mp->sA << ' '
402                         << std::setw(8) << (Cp.dirB == FORWARD_CHAR ? Mp->sB : revC(Mp->sB, Bf.len())) << ' '
403                         << std::setw(6) << Mp->len;
404           if ( Mp != Cp.matches.cbegin( ) )
405             ClusterFile << std::setw(6) << (Mp->sA - (Mp - 1)->sA - (Mp - 1)->len) << ' '
406                         << std::setw(6) << (Mp->sB - (Mp - 1)->sB - (Mp - 1)->len) << '\n';
407           else
408             ClusterFile << "     -      -\n";
409         }
410       }
411     }
412   }
413 }
414 
415 template<typename FR1, typename FR2>
printSAMAlignments(const std::vector<Alignment> & Alignments,const FR1 & A,const FR2 & B,std::ostream & SAMFile,bool long_format,const long minLen)416 void printSAMAlignments(const std::vector<Alignment>& Alignments,
417                         const FR1& A, const FR2& B,
418                         std::ostream& SAMFile, bool long_format,
419                         const long minLen) {
420   const char* mapq = Alignments.size() > 1 ? "\t10\t" : "\t30\t";
421   bool hard_clip = false;
422   for(const auto& Al : Alignments) {
423     if(std::abs(Al.eA - Al.sA) < minLen && std::abs(Al.eB - Al.sB) < minLen)
424       continue;
425     const bool fwd = Al.dirB == FORWARD_CHAR;
426     SAMFile << B.Id() << '\t'
427             << ((hard_clip ? 0x800 : 0) | (fwd ? 0 : 0x10)) << '\t'
428             << A.Id() << '\t' << Al.sA
429             << mapq
430             << createCIGAR(Al.delta, Al.sB, Al.eB, B.len(), hard_clip)
431             << "\t*\t0\t0\t";
432     if(long_format) {
433       const auto start = hard_clip ? Al.sB : 1;
434       const auto len   = hard_clip ? Al.eB - start + 1 : B.len();
435       SAMFile.write(B.seq() + start, len);
436     } else {
437       SAMFile << '*';
438     }
439     SAMFile << "\t*\tNM:i:" << Al.Errors;
440     if(long_format)
441       SAMFile << "\tMD:Z:" << createMD(Al, A.seq(), B.seq(), B.len());
442     SAMFile << '\n';
443     hard_clip = true;
444   }
445 }
446 
447 
448 } // namespace postnuc
449 } // namespace mummer
450 
451 #endif /* __MUMMER_POSTNUC_H__ */
452