1 //
2 //  kmeralign.cpp
3 //  Mothur
4 //
5 //  Created by Pat Schloss on 4/6/14.
6 //  Copyright (c) 2014 Schloss Lab. All rights reserved.
7 //
8 
9 #include "kmeralign.h"
10 #include "kmer.hpp"
11 #include "alignment.hpp"
12 
13 /**************************************************************************************************/
14 
KmerAlign(int k)15 KmerAlign::KmerAlign(int k) : kmerSize(k), kmerLibrary(k), Alignment() {
16 	try {
17         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
18         //maxKmer = kmerLibrary.getMaxKmer();
19         maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1;
20 	}
21 	catch(exception& e) {
22 		m->errorOut(e, "KmerAlign", "KmerAlign");
23 		exit(1);
24 	}
25 }
26 /**************************************************************************************************/
27 
~KmerAlign()28 KmerAlign::~KmerAlign(){	/*	do nothing	*/	}
29 
30 /**************************************************************************************************/
31 //modelled after pandaseqs kmer align, assemble.c
align(string A,string B,bool createBaseMap)32 void KmerAlign::align(string A, string B,  bool createBaseMap){
33 	try {
34         int aLength = A.length();
35         int bLength = B.length();
36         int maxOverlap = aLength;
37         if (bLength < aLength) { maxOverlap = bLength; }
38         maxOverlap -= 2;
39 
40         int nKmersA = A.length() - kmerSize + 1;
41         vector< vector<int> > kmerseen;
42         //set all kmers to unseen
43         kmerseen.resize(maxKmer);
44         //for (int i = 0; i < maxKmer; i++) { kmerseen[i].resize(numKmers, 0); }
45 
46         int kmer;
47         /* Scan forward sequence building k-mers and appending the position to kmerseen[k] */
48         for(int i=0;i<nKmersA;i++){
49             kmer = kmerLibrary.getKmerNumber(A, i);
50             if (kmer != (maxKmer-1)) {  kmerseen[kmer].push_back(i);  }
51         }
52         int nKmersB = B.length() - kmerSize + 1;
53 
54         /* Scan reverse sequence building k-mers. For each position in the forward sequence for this kmer (i.e., kmerseen[k]), flag that we should check the corresponding overlap. */
55         set<int> overlaps;
56         for(int i=0;i<nKmersB;i++){
57             kmer = kmerLibrary.getKmerNumber(B, i);
58             for (int j = 0; j < kmerseen[kmer].size(); j++) {  //for as many instances as we saw of this kmer, recoding overlap
59                 int index = aLength + bLength - (bLength-i) - kmerseen[kmer][j] - 2;
60                 if (index <= maxOverlap) { overlaps.insert(index); }
61             }
62         }
63 
64         if (overlaps.size() == 0) { //add all possible overlaps
65             for (int i = 0; i <= maxOverlap; i++) {  overlaps.insert(i);  } //minoverlap=2
66         }
67 
68         //find best overlap
69         double bestProb = -1.38629 * (aLength+bLength);
70         int bestOverlap = -1;
71         for (set<int>::iterator it = overlaps.begin(); it != overlaps.end(); it++) {
72             int index = *it;
73             int overlap = index + 2; //2 = minoverlap
74             double probability = calcProb(A, B, overlap);
75             //printf("overlap prob: %i, %f\n", overlap, probability);
76             if (probability > bestProb && overlap >= 2) {
77                 bestProb = probability;
78                 bestOverlap = overlap;
79             }
80         }
81         //printf("best overlap prob: %i, %f\n", bestOverlap, bestProb);
82         if(bestOverlap != -1){
83             if((aLength-bestOverlap) > 0){ //add gaps to the start of B
84                 int numGaps = (aLength-bestOverlap);
85                 B = string(numGaps, '-') + B;
86                 if (createBaseMap) {
87                     for (int i = 0; i < bLength; i++) {  BBaseMap[i+numGaps] = i;   }
88                     for (int i = 0; i < aLength; i++) {  ABaseMap[i] = i;           }
89                 }
90             }else {
91                 if (createBaseMap) {
92                     for (int i = 0; i < bLength; i++) {  BBaseMap[i] = i;   }
93                     for (int i = 0; i < aLength; i++) {  ABaseMap[i] = i;   }
94                 }
95             }
96         }
97         int diff = B.length() - A.length();
98         if(diff > 0){
99             A = A + string(diff, '-');
100         }
101 
102         seqAaln = A;
103         seqBaln = B;
104         pairwiseLength = seqAaln.length();
105 
106     }
107 	catch(exception& e) {
108 		m->errorOut(e, "KmerAlign", "align");
109 		exit(1);
110 	}
111 
112 }
113 /**************************************************************************************************/
114 //modelled after pandaseqs kmer align, assemble.c
calcProb(string A,string B,int overlap)115 double KmerAlign::calcProb(string A, string B, int overlap){
116     try {
117         double prob = 0;
118         int aLength = A.length();
119         int bLength = B.length();
120         int unknown, match, mismatch; unknown = 0; match = 0; mismatch = 0;
121 
122         for (int i = 0; i < overlap; i++) {
123             int findex = aLength + i - overlap;
124             int rindex = i;
125             if (findex < 0 || rindex < 0 || findex >= aLength || rindex >= bLength)
126                 continue;
127             char f = A[findex];
128             char r = B[rindex];
129             if ((f == 'N') || (r == 'N')) {
130                 unknown++;
131             } else if (r == f) {
132                 match++;
133             } else {
134                 mismatch++;
135             }
136         }
137         //ln(0.25 * (1 - 2 * 0.36 + 0.36 * 0.36))
138         double pmatch = -2.278869;
139         //ln((3 * 0.36 - 2 * 0.36 * 0.36) / 18.0)
140         double pmismatch = -3.087848;
141         if (overlap >= aLength && overlap >= bLength) {
142             prob = (-1.38629 * unknown + match * pmatch + mismatch * pmismatch);
143         } else {
144             prob = (-1.38629 * (aLength + bLength - 2 * overlap + unknown) + match * pmatch + mismatch * pmismatch);
145         }
146 
147 
148         return prob;
149     }
150     catch(exception& e) {
151         m->errorOut(e, "KmerAlign", "calcProb");
152         exit(1);
153     }
154 }
155 /**************************************************************************************************/
156 
157 
158