1 #include "SequenceFuns.h"
2 #include <assert.h>
3
complementSeqNumbers(char * ReadsIn,char * ReadsOut,uint Lread)4 void complementSeqNumbers(char* ReadsIn, char* ReadsOut, uint Lread) {//complement the numeric sequences
5 for (uint jj=0;jj<Lread;jj++) {
6 switch (int(ReadsIn[jj])){
7 case (3): ReadsOut[jj]=char(0);break;
8 case (2): ReadsOut[jj]=char(1);break;
9 case (1): ReadsOut[jj]=char(2);break;
10 case (0): ReadsOut[jj]=char(3);break;
11 default: ReadsOut[jj]=ReadsIn[jj];
12 };
13 };
14 };
15
revComplementNucleotides(char * ReadsIn,char * ReadsOut,uint Lread)16 void revComplementNucleotides(char* ReadsIn, char* ReadsOut, uint Lread) {//complement the numeric sequences
17 for (uint jj=0;jj<Lread;jj++) {
18 switch (ReadsIn[Lread-1-jj]){
19 case ('A'): ReadsOut[jj]='T';break;
20 case ('C'): ReadsOut[jj]='G';break;
21 case ('G'): ReadsOut[jj]='C';break;
22 case ('T'): ReadsOut[jj]='A';break;
23 case ('N'): ReadsOut[jj]='N';break;
24 case ('R'): ReadsOut[jj]='Y';break;
25 case ('Y'): ReadsOut[jj]='R';break;
26 case ('K'): ReadsOut[jj]='M';break;
27 case ('M'): ReadsOut[jj]='K';break;
28 case ('S'): ReadsOut[jj]='S';break;
29 case ('W'): ReadsOut[jj]='W';break;
30 case ('B'): ReadsOut[jj]='V';break;
31 case ('D'): ReadsOut[jj]='H';break;
32 case ('V'): ReadsOut[jj]='B';break;
33 case ('H'): ReadsOut[jj]='D';break;
34
35 case ('a'): ReadsOut[jj]='t';break;
36 case ('c'): ReadsOut[jj]='g';break;
37 case ('g'): ReadsOut[jj]='c';break;
38 case ('t'): ReadsOut[jj]='a';break;
39 case ('n'): ReadsOut[jj]='n';break;
40 case ('r'): ReadsOut[jj]='y';break;
41 case ('y'): ReadsOut[jj]='r';break;
42 case ('k'): ReadsOut[jj]='m';break;
43 case ('m'): ReadsOut[jj]='k';break;
44 case ('s'): ReadsOut[jj]='s';break;
45 case ('w'): ReadsOut[jj]='w';break;
46 case ('b'): ReadsOut[jj]='v';break;
47 case ('d'): ReadsOut[jj]='h';break;
48 case ('v'): ReadsOut[jj]='b';break;
49 case ('h'): ReadsOut[jj]='d';break;
50
51 default: ReadsOut[jj]=ReadsIn[Lread-1-jj];
52 };
53 };
54 };
55
revComplementNucleotides(string & seq)56 void revComplementNucleotides(string &seq) {//complement the numeric sequences
57 string seq1(seq);
58 for (uint jj=0;jj<seq.size();jj++) {
59 switch (seq1[seq.size()-1-jj]){
60 case ('A'): seq[jj]='T';break;
61 case ('C'): seq[jj]='G';break;
62 case ('G'): seq[jj]='C';break;
63 case ('T'): seq[jj]='A';break;
64 case ('N'): seq[jj]='N';break;
65 case ('R'): seq[jj]='Y';break;
66 case ('Y'): seq[jj]='R';break;
67 case ('K'): seq[jj]='M';break;
68 case ('M'): seq[jj]='K';break;
69 case ('S'): seq[jj]='S';break;
70 case ('W'): seq[jj]='W';break;
71 case ('B'): seq[jj]='V';break;
72 case ('D'): seq[jj]='H';break;
73 case ('V'): seq[jj]='B';break;
74 case ('H'): seq[jj]='D';break;
75
76 case ('a'): seq[jj]='t';break;
77 case ('c'): seq[jj]='g';break;
78 case ('g'): seq[jj]='c';break;
79 case ('t'): seq[jj]='a';break;
80 case ('n'): seq[jj]='n';break;
81 case ('r'): seq[jj]='y';break;
82 case ('y'): seq[jj]='r';break;
83 case ('k'): seq[jj]='m';break;
84 case ('m'): seq[jj]='k';break;
85 case ('s'): seq[jj]='s';break;
86 case ('w'): seq[jj]='w';break;
87 case ('b'): seq[jj]='v';break;
88 case ('d'): seq[jj]='h';break;
89 case ('v'): seq[jj]='b';break;
90 case ('h'): seq[jj]='d';break;
91
92 default: seq[jj]=seq1[seq.size()-1-jj];
93 };
94 };
95 };
96
97
98
nuclToNumBAM(char cc)99 char nuclToNumBAM(char cc){
100 switch (cc) {//=ACMGRSVTWYHKDBN
101 case ('='): cc=0;break;
102 case ('A'): case ('a'): cc=1;break;
103 case ('C'): case ('c'): cc=2;break;
104 case ('M'): case ('m'): cc=3;break;
105 case ('G'): case ('g'): cc=4;break;
106 case ('R'): case ('r'): cc=5;break;
107 case ('S'): case ('s'): cc=6;break;
108 case ('V'): case ('v'): cc=7;break;
109 case ('T'): case ('t'): cc=8;break;
110 case ('W'): case ('w'): cc=9;break;
111 case ('Y'): case ('y'): cc=10;break;
112 case ('H'): case ('h'): cc=11;break;
113 case ('K'): case ('k'): cc=12;break;
114 case ('D'): case ('d'): cc=13;break;
115 case ('B'): case ('b'): cc=14;break;
116 case ('N'): case ('n'): cc=15;break;
117 default: cc=15;
118 };
119 return cc;
120 };
121
nuclPackBAM(char * ReadsIn,char * ReadsOut,uint Lread)122 void nuclPackBAM(char* ReadsIn, char* ReadsOut, uint Lread) {//pack nucleotides for BAM
123 for (uint jj=0;jj<Lread/2;jj++) {
124 ReadsOut[jj]=nuclToNumBAM(ReadsIn[2*jj])<<4 | nuclToNumBAM(ReadsIn[2*jj+1]);
125 };
126 if (Lread%2==1) {
127 ReadsOut[Lread/2]=nuclToNumBAM(ReadsIn[Lread-1])<<4;
128 };
129 };
130
convertNucleotidesToNumbers(const char * R0,char * R1,const uint Lread)131 void convertNucleotidesToNumbers(const char* R0, char* R1, const uint Lread) {//transform sequence from ACGT into 0-1-2-3 code
132 for (uint jj=0;jj<Lread;jj++) {
133 switch (int(R0[jj])){
134 case (65): case(97):
135 R1[jj]=char(0);break;//A
136 case (67): case(99):
137 R1[jj]=char(1);break;//C
138 case (71): case(103):
139 R1[jj]=char(2);break;//G
140 case (84): case(116):
141 R1[jj]=char(3);break;//T
142 default:
143 R1[jj]=char(4);//anything else is converted to N
144 };
145 };
146 };
147
convertCapitalBasesToNum(uint8_t * rS,uint64_t N)148 void convertCapitalBasesToNum(uint8_t *rS, uint64_t N)
149 {//only capital bases are allowed
150 for (uint64_t ib=0; ib<N; ib++) {
151 switch (rS[ib]) {
152 case 'A':
153 rS[ib]=0;
154 break;
155 case 'C':
156 rS[ib]=1;
157 break;
158 case 'G':
159 rS[ib]=2;
160 break;
161 case 'T':
162 rS[ib]=3;
163 break;
164 default:
165 rS[ib]=4;
166 };
167 };
168 };
169
convertNucleotidesToNumbersRemoveControls(const char * R0,char * R1,const uint Lread)170 uint convertNucleotidesToNumbersRemoveControls(const char* R0, char* R1, const uint Lread) {//transform sequence from ACGT into 0-1-2-3 code
171 uint iR1=0;
172 for (uint jj=0;jj<Lread;jj++) {
173 switch (int(R0[jj])){
174 case (65): case(97):
175 R1[jj]=char(0);break;//A
176 case (67): case(99):
177 R1[jj]=char(1);break;//C
178 case (71): case(103):
179 R1[jj]=char(2);break;//G
180 case (84): case(116):
181 R1[jj]=char(3);break;//T
182 default:
183 if (int(R0[jj]) < 32) {//control characters are skipped
184 continue;
185 } else {//all non-control non-ACGT characters are convreted to N
186 R1[jj]=char(4);//anything else
187 };
188 };
189 ++iR1;
190 };
191 return iR1;
192 };
193
194
convertNt01234(const char R0)195 char convertNt01234(const char R0) {//transform sequence from ACGT into 0-1-2-3 code
196 switch(R0)
197 {
198 case('a'):
199 case('A'):
200 return 0;
201 break;
202 case('c'):
203 case('C'):
204 return 1;
205 break;
206 case('g'):
207 case('G'):
208 return 2;
209 break;
210 case('t'):
211 case('T'):
212 return 3;
213 break;
214 default:
215 return 4;
216 };
217 };
218
convertNuclStrToInt32(const string S,uint32 & intOut)219 int32 convertNuclStrToInt32(const string S, uint32 &intOut) {
220 intOut=0;
221 int32 posN=-1;
222 for (uint32 ii=0; ii<S.size(); ii++) {
223 uint32 nt = (uint32) convertNt01234(S.at(ii));
224 if (nt>3) {//N
225 if (posN>=0)
226 return -2; //two Ns
227 posN=ii;
228 nt=0;
229 };
230 intOut = intOut << 2;
231 intOut +=nt;
232 //intOut += nt<<(2*ii);
233 };
234 return posN;
235 };
236
convertNuclInt32toString(uint32 nuclNum,const uint32 L)237 string convertNuclInt32toString(uint32 nuclNum, const uint32 L) {
238 string nuclOut(L,'N');
239 string nuclChar="ACGT";
240
241 for (uint32 ii=1; ii<=L; ii++) {
242 nuclOut[L-ii] = nuclChar[nuclNum & 3];
243 nuclNum = nuclNum >> 2;
244 };
245
246 return nuclOut;
247 };
248
convertNuclStrToInt64(const string S,uint64 & intOut)249 int64 convertNuclStrToInt64(const string S, uint64 &intOut) {
250 intOut=0;
251 int64 posN=-1;
252 for (uint64 ii=0; ii<S.size(); ii++) {
253 uint64 nt = (uint64) convertNt01234(S[ii]);
254 if (nt>3) {//N
255 if (posN>=0)
256 return -2; //two Ns
257 posN=ii;
258 nt=0;
259 };
260 intOut = intOut << 2;
261 intOut +=nt;
262 //intOut += nt<<(2*ii);
263 };
264 return posN;
265 };
266
convertNuclInt64toString(uint64 nuclNum,const uint32 L)267 string convertNuclInt64toString(uint64 nuclNum, const uint32 L) {
268 string nuclOut(L,'N');
269 string nuclChar="ACGT";
270
271 for (uint64 ii=1; ii<=L; ii++) {
272 nuclOut[L-ii] = nuclChar[nuclNum & 3];
273 nuclNum = nuclNum >> 2;
274 };
275
276 return nuclOut;
277 };
278
279
chrFind(uint Start,uint i2,uint * chrStart)280 uint chrFind(uint Start, uint i2, uint* chrStart) {// find chromosome from global locus
281 uint i1=0, i3;
282 while (i1+1<i2) {
283 i3=(i1+i2)/2;
284 if ( chrStart[i3] > Start ) {
285 i2=i3;
286 } else {
287 i1=i3;
288 };
289 };
290 return i1;
291 };
292
localSearch(const char * x,uint nx,const char * y,uint ny,double pMM)293 uint localSearch(const char *x, uint nx, const char *y, uint ny, double pMM){
294 //find the best alignment of two short sequences x and y
295 //pMM is the maximum percentage of mismatches
296 uint nMatch=0, nMM=0, nMatchBest=0, nMMbest=0, ixBest=nx;
297 for (uint ix=0;ix<nx;ix++) {
298 nMatch=0; nMM=0;
299 for (uint iy=0;iy<min(ny,nx-ix);iy++) {
300 if (x[ix+iy]>3) continue;
301 if (x[ix+iy]==y[iy]) {
302 nMatch++;
303 } else {
304 nMM++;
305 };
306 };
307
308 if ( ( nMatch>nMatchBest || (nMatch==nMatchBest && nMM<nMMbest) ) && double(nMM)/double(nMatch)<=pMM) {
309 ixBest=ix;
310 nMatchBest=nMatch;
311 nMMbest=nMM;
312 };
313 };
314 return ixBest;
315 };
316
localSearchNisMM(const char * x,uint nx,const char * y,uint ny,double pMM)317 uint localSearchNisMM(const char *x, uint nx, const char *y, uint ny, double pMM){
318 //find the best alignment of two short sequences x and y
319 //pMM is the maximum percentage of mismatches
320 //Ns in x OR y are considered mismatches
321 uint nMatch=0, nMM=0, nMatchBest=0, nMMbest=0, ixBest=nx;
322 for (uint ix=0;ix<nx;ix++) {
323 nMatch=0; nMM=0;
324 for (uint iy=0;iy<min(ny,nx-ix);iy++) {
325 if (x[ix+iy]==y[iy] && y[iy]<4) {
326 nMatch++;
327 } else {
328 nMM++;
329 };
330 };
331
332 if ( ( nMatch>nMatchBest || (nMatch==nMatchBest && nMM<nMMbest) ) && double(nMM)/double(nMatch)<=pMM) {
333 ixBest=ix;
334 nMatchBest=nMatch;
335 nMMbest=nMM;
336 };
337 };
338 return ixBest;
339 };
340
localAlignHammingDist(const string & text,const string & query,uint32 & pos)341 uint32 localAlignHammingDist(const string &text, const string &query, uint32 &pos)
342 {
343 uint32 distBest=query.size();
344 if (text.size()<query.size()) {//query is longer than text, no match
345 return text.size()+1;
346 };
347 for (uint32 ii=0; ii<text.size()-query.size()+1; ii++) {
348 uint32 dist1=0;
349 for (uint32 jj=0; jj<query.size(); jj++) {
350 if (query[jj]!='N' && text[jj+ii]!=query[jj]) {//N in query does not count as mismatch
351 ++dist1;
352 };
353 };
354 if (dist1<distBest) {
355 distBest=dist1;
356 pos=ii;
357 };
358 };
359 return distBest;
360 };
361
362 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
363 /*
364 uint32 localSearchGeneral(const char *text, const uint32 textLen, const vector<char> &query, const int32 textStart, const int32 textEnd, double pMM, vector <uint32> vecMM, uint32 &nMM)
365 {
366 assert(textEnd <= (int32)textLen);
367 assert(textStart + (int32)query.size() >= 0);
368
369 nMM=0;
370
371 uint32 nMatchBest=0;
372 int32 posBest=textLen;
373 uint32 clippedL = 0;
374
375
376 int32 dirSearch = (textStart <= textEnd ? 1 : -1); //search direction
377
378 for (int32 pos=textStart; pos!=textEnd; pos+=dirSearch) {
379 int32 qs = max(0, -pos);
380 int32 qe = min((uint32)query.size(), (uint32)(textLen-pos) );
381
382 uint32 nMatch1=0, nMM1=0;
383
384 for (uint32 iq=qs; iq<qe; iq++) {
385 if (text[pos+iq]>3)
386 continue; //Ns in the text are not counted as matches or mismatches
387 if (text[pos+iq]==query[iq]) {
388 nMatch1++;
389 } else {
390 nMM1++;
391 if ( nMM1 >= vecMM.size() ) {
392 nMatch1=0;
393 break; //too many mismatches
394 };
395 };
396 };
397
398 //if ( (nMatch1>nMatchBest || (nMatch1==nMatchBest && nMM1<nMM)) && double(nMM1)<=double(nMatch1)*pMM ) {
399 if ( (nMatch1>nMatchBest || (nMatch1==nMatchBest && nMM1<nMM)) && nMM1<vecMM.size() && (qe-qs)>=vecMM[nMM1]) {
400 posBest=pos;
401 nMatchBest=nMatch1;
402 nMM=nMM1;
403 clippedL = (uint32)(textStart <= textEnd ? posBest+(int32)query.size(): -posBest+(int32)textLen );
404 };
405 };
406
407 return clippedL;
408 };
409 */
410 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////
qualitySplit(char * r,uint L,uint maxNsplit,uint minLsplit,uint ** splitR)411 uint qualitySplit(char* r,uint L, uint maxNsplit, uint minLsplit, uint** splitR) {
412 //splits the read r[L] by quality scores q[L], outputs in splitR - split coordinate/length - per base
413 //returns number of good split regions
414 uint iR=0,iS=0,iR1,LgoodMin=0, iFrag=0;
415 while ( (iR<L) & (iS<maxNsplit) ) { //main cycle
416 //find next good base
417 while ( iR<L && r[iR]>3 ) {
418 if (r[iR]==MARK_FRAG_SPACER_BASE)
419 iFrag++; //count read fragments
420 iR++;
421 };
422
423 if (iR==L) break; //exit when reached end of read
424
425 iR1=iR;
426
427 //find the next bad base
428 while ( iR<L && r[iR]<=3 ) {
429 iR++;
430 };
431
432 if ( (iR-iR1)>LgoodMin ) LgoodMin=iR-iR1;
433 if ( (iR-iR1)<minLsplit ) continue; //too short for a good region
434
435 splitR[0][iS]=iR1; //good region start
436 splitR[1][iS]=iR-iR1; //good region length
437 splitR[2][iS]=iFrag; //good region fragment
438 iS++;
439 };
440
441 if (iS==0) splitR[1][0]=LgoodMin; //output min good piece length
442
443 return iS;
444 };
445
446