1 #include "SequenceFuns.h"
2 #include <assert.h>
3 
complementSeqNumbers(char * ReadsIn,char * ReadsOut,uint Lread)4 void complementSeqNumbers(char* ReadsIn, char* ReadsOut, uint Lread) {//complement the numeric sequences
5     for (uint jj=0;jj<Lread;jj++) {
6         switch (int(ReadsIn[jj])){
7             case (3): ReadsOut[jj]=char(0);break;
8             case (2): ReadsOut[jj]=char(1);break;
9             case (1): ReadsOut[jj]=char(2);break;
10             case (0): ReadsOut[jj]=char(3);break;
11             default:  ReadsOut[jj]=ReadsIn[jj];
12         };
13     };
14 };
15 
revComplementNucleotides(char * ReadsIn,char * ReadsOut,uint Lread)16 void revComplementNucleotides(char* ReadsIn, char* ReadsOut, uint Lread) {//complement the numeric sequences
17     for (uint jj=0;jj<Lread;jj++) {
18         switch (ReadsIn[Lread-1-jj]){
19             case ('A'): ReadsOut[jj]='T';break;
20             case ('C'): ReadsOut[jj]='G';break;
21             case ('G'): ReadsOut[jj]='C';break;
22             case ('T'): ReadsOut[jj]='A';break;
23             case ('N'): ReadsOut[jj]='N';break;
24             case ('R'): ReadsOut[jj]='Y';break;
25             case ('Y'): ReadsOut[jj]='R';break;
26             case ('K'): ReadsOut[jj]='M';break;
27             case ('M'): ReadsOut[jj]='K';break;
28             case ('S'): ReadsOut[jj]='S';break;
29             case ('W'): ReadsOut[jj]='W';break;
30             case ('B'): ReadsOut[jj]='V';break;
31             case ('D'): ReadsOut[jj]='H';break;
32             case ('V'): ReadsOut[jj]='B';break;
33             case ('H'): ReadsOut[jj]='D';break;
34 
35             case ('a'): ReadsOut[jj]='t';break;
36             case ('c'): ReadsOut[jj]='g';break;
37             case ('g'): ReadsOut[jj]='c';break;
38             case ('t'): ReadsOut[jj]='a';break;
39             case ('n'): ReadsOut[jj]='n';break;
40             case ('r'): ReadsOut[jj]='y';break;
41             case ('y'): ReadsOut[jj]='r';break;
42             case ('k'): ReadsOut[jj]='m';break;
43             case ('m'): ReadsOut[jj]='k';break;
44             case ('s'): ReadsOut[jj]='s';break;
45             case ('w'): ReadsOut[jj]='w';break;
46             case ('b'): ReadsOut[jj]='v';break;
47             case ('d'): ReadsOut[jj]='h';break;
48             case ('v'): ReadsOut[jj]='b';break;
49             case ('h'): ReadsOut[jj]='d';break;
50 
51             default:   ReadsOut[jj]=ReadsIn[Lread-1-jj];
52         };
53     };
54 };
55 
revComplementNucleotides(string & seq)56 void revComplementNucleotides(string &seq) {//complement the numeric sequences
57     string seq1(seq);
58     for (uint jj=0;jj<seq.size();jj++) {
59         switch (seq1[seq.size()-1-jj]){
60             case ('A'): seq[jj]='T';break;
61             case ('C'): seq[jj]='G';break;
62             case ('G'): seq[jj]='C';break;
63             case ('T'): seq[jj]='A';break;
64             case ('N'): seq[jj]='N';break;
65             case ('R'): seq[jj]='Y';break;
66             case ('Y'): seq[jj]='R';break;
67             case ('K'): seq[jj]='M';break;
68             case ('M'): seq[jj]='K';break;
69             case ('S'): seq[jj]='S';break;
70             case ('W'): seq[jj]='W';break;
71             case ('B'): seq[jj]='V';break;
72             case ('D'): seq[jj]='H';break;
73             case ('V'): seq[jj]='B';break;
74             case ('H'): seq[jj]='D';break;
75 
76             case ('a'): seq[jj]='t';break;
77             case ('c'): seq[jj]='g';break;
78             case ('g'): seq[jj]='c';break;
79             case ('t'): seq[jj]='a';break;
80             case ('n'): seq[jj]='n';break;
81             case ('r'): seq[jj]='y';break;
82             case ('y'): seq[jj]='r';break;
83             case ('k'): seq[jj]='m';break;
84             case ('m'): seq[jj]='k';break;
85             case ('s'): seq[jj]='s';break;
86             case ('w'): seq[jj]='w';break;
87             case ('b'): seq[jj]='v';break;
88             case ('d'): seq[jj]='h';break;
89             case ('v'): seq[jj]='b';break;
90             case ('h'): seq[jj]='d';break;
91 
92             default:   seq[jj]=seq1[seq.size()-1-jj];
93         };
94     };
95 };
96 
97 
98 
nuclToNumBAM(char cc)99 char  nuclToNumBAM(char cc){
100     switch (cc) {//=ACMGRSVTWYHKDBN
101         case ('='): cc=0;break;
102         case ('A'): case ('a'): cc=1;break;
103         case ('C'): case ('c'): cc=2;break;
104         case ('M'): case ('m'): cc=3;break;
105         case ('G'): case ('g'): cc=4;break;
106         case ('R'): case ('r'): cc=5;break;
107         case ('S'): case ('s'): cc=6;break;
108         case ('V'): case ('v'): cc=7;break;
109         case ('T'): case ('t'): cc=8;break;
110         case ('W'): case ('w'): cc=9;break;
111         case ('Y'): case ('y'): cc=10;break;
112         case ('H'): case ('h'): cc=11;break;
113         case ('K'): case ('k'): cc=12;break;
114         case ('D'): case ('d'): cc=13;break;
115         case ('B'): case ('b'): cc=14;break;
116         case ('N'): case ('n'): cc=15;break;
117         default: cc=15;
118     };
119     return cc;
120 };
121 
nuclPackBAM(char * ReadsIn,char * ReadsOut,uint Lread)122 void nuclPackBAM(char* ReadsIn, char* ReadsOut, uint Lread) {//pack nucleotides for BAM
123     for (uint jj=0;jj<Lread/2;jj++) {
124         ReadsOut[jj]=nuclToNumBAM(ReadsIn[2*jj])<<4 | nuclToNumBAM(ReadsIn[2*jj+1]);
125     };
126     if (Lread%2==1) {
127         ReadsOut[Lread/2]=nuclToNumBAM(ReadsIn[Lread-1])<<4;
128     };
129 };
130 
convertNucleotidesToNumbers(const char * R0,char * R1,const uint Lread)131 void convertNucleotidesToNumbers(const char* R0, char* R1, const uint Lread) {//transform sequence  from ACGT into 0-1-2-3 code
132     for (uint jj=0;jj<Lread;jj++) {
133                     switch (int(R0[jj])){
134                         case (65): case(97):
135                             R1[jj]=char(0);break;//A
136                         case (67): case(99):
137                             R1[jj]=char(1);break;//C
138                         case (71): case(103):
139                             R1[jj]=char(2);break;//G
140                         case (84): case(116):
141                             R1[jj]=char(3);break;//T
142                         default:
143                             R1[jj]=char(4);//anything else is converted to N
144                     };
145                 };
146 };
147 
convertCapitalBasesToNum(uint8_t * rS,uint64_t N)148 void convertCapitalBasesToNum(uint8_t *rS, uint64_t N)
149 {//only capital bases are allowed
150     for (uint64_t ib=0; ib<N; ib++) {
151         switch (rS[ib]) {
152             case 'A':
153                 rS[ib]=0;
154                 break;
155             case 'C':
156                 rS[ib]=1;
157                 break;
158             case 'G':
159                 rS[ib]=2;
160                 break;
161             case 'T':
162                 rS[ib]=3;
163                 break;
164             default:
165                 rS[ib]=4;
166         };
167     };
168 };
169 
convertNucleotidesToNumbersRemoveControls(const char * R0,char * R1,const uint Lread)170 uint convertNucleotidesToNumbersRemoveControls(const char* R0, char* R1, const uint Lread) {//transform sequence  from ACGT into 0-1-2-3 code
171     uint iR1=0;
172     for (uint jj=0;jj<Lread;jj++) {
173         switch (int(R0[jj])){
174             case (65): case(97):
175                 R1[jj]=char(0);break;//A
176             case (67): case(99):
177                 R1[jj]=char(1);break;//C
178             case (71): case(103):
179                 R1[jj]=char(2);break;//G
180             case (84): case(116):
181                 R1[jj]=char(3);break;//T
182             default:
183                 if (int(R0[jj]) < 32) {//control characters are skipped
184                     continue;
185                 } else {//all non-control non-ACGT characters are convreted to N
186                     R1[jj]=char(4);//anything else
187                 };
188         };
189         ++iR1;
190     };
191     return iR1;
192 };
193 
194 
convertNt01234(const char R0)195 char convertNt01234(const char R0) {//transform sequence  from ACGT into 0-1-2-3 code
196     switch(R0)
197     {
198         case('a'):
199         case('A'):
200             return 0;
201             break;
202         case('c'):
203         case('C'):
204             return 1;
205             break;
206         case('g'):
207         case('G'):
208             return 2;
209             break;
210         case('t'):
211         case('T'):
212             return 3;
213             break;
214         default:
215             return 4;
216     };
217 };
218 
convertNuclStrToInt32(const string S,uint32 & intOut)219 int32 convertNuclStrToInt32(const string S, uint32 &intOut) {
220     intOut=0;
221     int32 posN=-1;
222     for (uint32 ii=0; ii<S.size(); ii++) {
223         uint32 nt = (uint32) convertNt01234(S.at(ii));
224         if (nt>3) {//N
225             if (posN>=0)
226                 return -2; //two Ns
227             posN=ii;
228             nt=0;
229         };
230         intOut = intOut << 2;
231         intOut +=nt;
232         //intOut += nt<<(2*ii);
233     };
234     return posN;
235 };
236 
convertNuclInt32toString(uint32 nuclNum,const uint32 L)237 string convertNuclInt32toString(uint32 nuclNum, const uint32 L) {
238     string nuclOut(L,'N');
239     string nuclChar="ACGT";
240 
241     for (uint32 ii=1; ii<=L; ii++) {
242         nuclOut[L-ii] = nuclChar[nuclNum & 3];
243         nuclNum = nuclNum >> 2;
244     };
245 
246     return nuclOut;
247 };
248 
convertNuclStrToInt64(const string S,uint64 & intOut)249 int64 convertNuclStrToInt64(const string S, uint64 &intOut) {
250     intOut=0;
251     int64 posN=-1;
252     for (uint64 ii=0; ii<S.size(); ii++) {
253         uint64 nt = (uint64) convertNt01234(S[ii]);
254         if (nt>3) {//N
255             if (posN>=0)
256                 return -2; //two Ns
257             posN=ii;
258             nt=0;
259         };
260         intOut = intOut << 2;
261         intOut +=nt;
262         //intOut += nt<<(2*ii);
263     };
264     return posN;
265 };
266 
convertNuclInt64toString(uint64 nuclNum,const uint32 L)267 string convertNuclInt64toString(uint64 nuclNum, const uint32 L) {
268     string nuclOut(L,'N');
269     string nuclChar="ACGT";
270 
271     for (uint64 ii=1; ii<=L; ii++) {
272         nuclOut[L-ii] = nuclChar[nuclNum & 3];
273         nuclNum = nuclNum >> 2;
274     };
275 
276     return nuclOut;
277 };
278 
279 
chrFind(uint Start,uint i2,uint * chrStart)280 uint chrFind(uint Start, uint i2, uint* chrStart) {// find chromosome from global locus
281     uint i1=0, i3;
282     while (i1+1<i2) {
283         i3=(i1+i2)/2;
284         if ( chrStart[i3] > Start ) {
285             i2=i3;
286         } else {
287             i1=i3;
288         };
289     };
290     return i1;
291 };
292 
localSearch(const char * x,uint nx,const char * y,uint ny,double pMM)293 uint localSearch(const char *x, uint nx, const char *y, uint ny, double pMM){
294     //find the best alignment of two short sequences x and y
295     //pMM is the maximum percentage of mismatches
296     uint nMatch=0, nMM=0, nMatchBest=0, nMMbest=0, ixBest=nx;
297     for (uint ix=0;ix<nx;ix++) {
298         nMatch=0; nMM=0;
299         for (uint iy=0;iy<min(ny,nx-ix);iy++) {
300             if (x[ix+iy]>3) continue;
301             if (x[ix+iy]==y[iy]) {
302                 nMatch++;
303             } else {
304                 nMM++;
305             };
306         };
307 
308         if ( ( nMatch>nMatchBest || (nMatch==nMatchBest && nMM<nMMbest) ) && double(nMM)/double(nMatch)<=pMM) {
309             ixBest=ix;
310             nMatchBest=nMatch;
311             nMMbest=nMM;
312         };
313     };
314     return ixBest;
315 };
316 
localSearchNisMM(const char * x,uint nx,const char * y,uint ny,double pMM)317 uint localSearchNisMM(const char *x, uint nx, const char *y, uint ny, double pMM){
318     //find the best alignment of two short sequences x and y
319     //pMM is the maximum percentage of mismatches
320     //Ns in x OR y are considered mismatches
321     uint nMatch=0, nMM=0, nMatchBest=0, nMMbest=0, ixBest=nx;
322     for (uint ix=0;ix<nx;ix++) {
323         nMatch=0; nMM=0;
324         for (uint iy=0;iy<min(ny,nx-ix);iy++) {
325             if (x[ix+iy]==y[iy] && y[iy]<4) {
326                 nMatch++;
327             } else {
328                 nMM++;
329             };
330         };
331 
332         if ( ( nMatch>nMatchBest || (nMatch==nMatchBest && nMM<nMMbest) ) && double(nMM)/double(nMatch)<=pMM) {
333             ixBest=ix;
334             nMatchBest=nMatch;
335             nMMbest=nMM;
336         };
337     };
338     return ixBest;
339 };
340 
localAlignHammingDist(const string & text,const string & query,uint32 & pos)341 uint32 localAlignHammingDist(const string &text, const string &query, uint32 &pos)
342 {
343     uint32 distBest=query.size();
344     if (text.size()<query.size()) {//query is longer than text, no match
345     	return text.size()+1;
346     };
347     for (uint32 ii=0; ii<text.size()-query.size()+1; ii++) {
348         uint32 dist1=0;
349         for (uint32 jj=0; jj<query.size(); jj++) {
350             if (query[jj]!='N' && text[jj+ii]!=query[jj]) {//N in query does not count as mismatch
351                 ++dist1;
352             };
353         };
354         if (dist1<distBest) {
355             distBest=dist1;
356             pos=ii;
357         };
358     };
359     return distBest;
360 };
361 
362 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
363 /*
364 uint32 localSearchGeneral(const char *text, const uint32 textLen, const vector<char> &query, const int32 textStart, const int32 textEnd, double pMM, vector <uint32> vecMM, uint32 &nMM)
365 {
366     assert(textEnd <= (int32)textLen);
367     assert(textStart + (int32)query.size() >= 0);
368 
369     nMM=0;
370 
371     uint32 nMatchBest=0;
372     int32 posBest=textLen;
373     uint32 clippedL = 0;
374 
375 
376     int32 dirSearch = (textStart <= textEnd ? 1 : -1); //search direction
377 
378     for (int32 pos=textStart; pos!=textEnd; pos+=dirSearch) {
379         int32 qs = max(0, -pos);
380         int32 qe = min((uint32)query.size(), (uint32)(textLen-pos) );
381 
382         uint32 nMatch1=0, nMM1=0;
383 
384         for (uint32 iq=qs; iq<qe; iq++) {
385             if (text[pos+iq]>3)
386                 continue; //Ns in the text are not counted as matches or mismatches
387             if (text[pos+iq]==query[iq]) {
388                 nMatch1++;
389             } else {
390                 nMM1++;
391                 if ( nMM1 >= vecMM.size() ) {
392                     nMatch1=0;
393                     break; //too many mismatches
394                 };
395             };
396         };
397 
398         //if ( (nMatch1>nMatchBest || (nMatch1==nMatchBest && nMM1<nMM)) && double(nMM1)<=double(nMatch1)*pMM ) {
399         if ( (nMatch1>nMatchBest || (nMatch1==nMatchBest && nMM1<nMM)) && nMM1<vecMM.size() && (qe-qs)>=vecMM[nMM1]) {
400             posBest=pos;
401             nMatchBest=nMatch1;
402             nMM=nMM1;
403             clippedL = (uint32)(textStart <= textEnd ? posBest+(int32)query.size(): -posBest+(int32)textLen );
404         };
405     };
406 
407     return clippedL;
408 };
409 */
410 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////
qualitySplit(char * r,uint L,uint maxNsplit,uint minLsplit,uint ** splitR)411 uint qualitySplit(char* r,uint L, uint maxNsplit, uint  minLsplit, uint** splitR) {
412     //splits the read r[L] by quality scores q[L], outputs in splitR - split coordinate/length - per base
413     //returns number of good split regions
414     uint iR=0,iS=0,iR1,LgoodMin=0, iFrag=0;
415     while ( (iR<L) & (iS<maxNsplit) ) { //main cycle
416         //find next good base
417         while ( iR<L && r[iR]>3 ) {
418             if (r[iR]==MARK_FRAG_SPACER_BASE)
419                 iFrag++; //count read fragments
420             iR++;
421         };
422 
423         if (iR==L) break; //exit when reached end of read
424 
425         iR1=iR;
426 
427         //find the next bad base
428         while ( iR<L && r[iR]<=3 ) {
429             iR++;
430         };
431 
432         if ( (iR-iR1)>LgoodMin ) LgoodMin=iR-iR1;
433         if ( (iR-iR1)<minLsplit ) continue; //too short for a good region
434 
435         splitR[0][iS]=iR1;      //good region start
436         splitR[1][iS]=iR-iR1;   //good region length
437         splitR[2][iS]=iFrag;    //good region fragment
438         iS++;
439     };
440 
441     if (iS==0) splitR[1][0]=LgoodMin; //output min good piece length
442 
443     return iS;
444 };
445 
446