1 //
2 // kmeralign.cpp
3 // Mothur
4 //
5 // Created by Pat Schloss on 4/6/14.
6 // Copyright (c) 2014 Schloss Lab. All rights reserved.
7 //
8
9 #include "kmeralign.h"
10 #include "kmer.hpp"
11 #include "alignment.hpp"
12
13 /**************************************************************************************************/
14
KmerAlign(int k)15 KmerAlign::KmerAlign(int k) : kmerSize(k), kmerLibrary(k), Alignment() {
16 try {
17 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
18 //maxKmer = kmerLibrary.getMaxKmer();
19 maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1;
20 }
21 catch(exception& e) {
22 m->errorOut(e, "KmerAlign", "KmerAlign");
23 exit(1);
24 }
25 }
26 /**************************************************************************************************/
27
~KmerAlign()28 KmerAlign::~KmerAlign(){ /* do nothing */ }
29
30 /**************************************************************************************************/
31 //modelled after pandaseqs kmer align, assemble.c
align(string A,string B,bool createBaseMap)32 void KmerAlign::align(string A, string B, bool createBaseMap){
33 try {
34 int aLength = A.length();
35 int bLength = B.length();
36 int maxOverlap = aLength;
37 if (bLength < aLength) { maxOverlap = bLength; }
38 maxOverlap -= 2;
39
40 int nKmersA = A.length() - kmerSize + 1;
41 vector< vector<int> > kmerseen;
42 //set all kmers to unseen
43 kmerseen.resize(maxKmer);
44 //for (int i = 0; i < maxKmer; i++) { kmerseen[i].resize(numKmers, 0); }
45
46 int kmer;
47 /* Scan forward sequence building k-mers and appending the position to kmerseen[k] */
48 for(int i=0;i<nKmersA;i++){
49 kmer = kmerLibrary.getKmerNumber(A, i);
50 if (kmer != (maxKmer-1)) { kmerseen[kmer].push_back(i); }
51 }
52 int nKmersB = B.length() - kmerSize + 1;
53
54 /* Scan reverse sequence building k-mers. For each position in the forward sequence for this kmer (i.e., kmerseen[k]), flag that we should check the corresponding overlap. */
55 set<int> overlaps;
56 for(int i=0;i<nKmersB;i++){
57 kmer = kmerLibrary.getKmerNumber(B, i);
58 for (int j = 0; j < kmerseen[kmer].size(); j++) { //for as many instances as we saw of this kmer, recoding overlap
59 int index = aLength + bLength - (bLength-i) - kmerseen[kmer][j] - 2;
60 if (index <= maxOverlap) { overlaps.insert(index); }
61 }
62 }
63
64 if (overlaps.size() == 0) { //add all possible overlaps
65 for (int i = 0; i <= maxOverlap; i++) { overlaps.insert(i); } //minoverlap=2
66 }
67
68 //find best overlap
69 double bestProb = -1.38629 * (aLength+bLength);
70 int bestOverlap = -1;
71 for (set<int>::iterator it = overlaps.begin(); it != overlaps.end(); it++) {
72 int index = *it;
73 int overlap = index + 2; //2 = minoverlap
74 double probability = calcProb(A, B, overlap);
75 //printf("overlap prob: %i, %f\n", overlap, probability);
76 if (probability > bestProb && overlap >= 2) {
77 bestProb = probability;
78 bestOverlap = overlap;
79 }
80 }
81 //printf("best overlap prob: %i, %f\n", bestOverlap, bestProb);
82 if(bestOverlap != -1){
83 if((aLength-bestOverlap) > 0){ //add gaps to the start of B
84 int numGaps = (aLength-bestOverlap);
85 B = string(numGaps, '-') + B;
86 if (createBaseMap) {
87 for (int i = 0; i < bLength; i++) { BBaseMap[i+numGaps] = i; }
88 for (int i = 0; i < aLength; i++) { ABaseMap[i] = i; }
89 }
90 }else {
91 if (createBaseMap) {
92 for (int i = 0; i < bLength; i++) { BBaseMap[i] = i; }
93 for (int i = 0; i < aLength; i++) { ABaseMap[i] = i; }
94 }
95 }
96 }
97 int diff = B.length() - A.length();
98 if(diff > 0){
99 A = A + string(diff, '-');
100 }
101
102 seqAaln = A;
103 seqBaln = B;
104 pairwiseLength = seqAaln.length();
105
106 }
107 catch(exception& e) {
108 m->errorOut(e, "KmerAlign", "align");
109 exit(1);
110 }
111
112 }
113 /**************************************************************************************************/
114 //modelled after pandaseqs kmer align, assemble.c
calcProb(string A,string B,int overlap)115 double KmerAlign::calcProb(string A, string B, int overlap){
116 try {
117 double prob = 0;
118 int aLength = A.length();
119 int bLength = B.length();
120 int unknown, match, mismatch; unknown = 0; match = 0; mismatch = 0;
121
122 for (int i = 0; i < overlap; i++) {
123 int findex = aLength + i - overlap;
124 int rindex = i;
125 if (findex < 0 || rindex < 0 || findex >= aLength || rindex >= bLength)
126 continue;
127 char f = A[findex];
128 char r = B[rindex];
129 if ((f == 'N') || (r == 'N')) {
130 unknown++;
131 } else if (r == f) {
132 match++;
133 } else {
134 mismatch++;
135 }
136 }
137 //ln(0.25 * (1 - 2 * 0.36 + 0.36 * 0.36))
138 double pmatch = -2.278869;
139 //ln((3 * 0.36 - 2 * 0.36 * 0.36) / 18.0)
140 double pmismatch = -3.087848;
141 if (overlap >= aLength && overlap >= bLength) {
142 prob = (-1.38629 * unknown + match * pmatch + mismatch * pmismatch);
143 } else {
144 prob = (-1.38629 * (aLength + bLength - 2 * overlap + unknown) + match * pmatch + mismatch * pmismatch);
145 }
146
147
148 return prob;
149 }
150 catch(exception& e) {
151 m->errorOut(e, "KmerAlign", "calcProb");
152 exit(1);
153 }
154 }
155 /**************************************************************************************************/
156
157
158