1 #include <unordered_map>
2 #include "bamRemoveDuplicates.h"
3 #include <iostream>
4 #include <htslib/sam.h>
5 #include "IncludeDefine.h"
6 #include SAMTOOLS_BGZF_H
7 #include "ErrorWarning.h"
8
9 #define compareReturn(a,b) if(a>b) {return 1;} else if (a<b) {return -1;}
10
11 uint g_bamRemoveDuplicatesMate2basesN;
12
funCompareNames(const void * a,const void * b)13 int funCompareNames(const void *a, const void *b) {//compare read names
14 uint32* pa=(uint32*) *(uint32**) a;
15 uint32* pb=(uint32*) *(uint32**) b;
16
17 uint32 la=(pa[3]<<24)>>24;
18 uint32 lb=(pb[3]<<24)>>24;
19
20 compareReturn(la,lb) else {
21 char* ca=(char*) (pa+9);
22 char* cb=(char*) (pb+9);
23 for (uint32 ii=0;ii<la;ii++) {
24 compareReturn(ca[ii],cb[ii]);
25 };
26 uint32 fa=pa[4]>>16;
27 uint32 fb=pb[4]>>16;
28
29 compareReturn((fa&0x80), (fb&0x80));
30 return 0;
31 };
32 };
33
funStartExtendS(const uint32 * const p)34 uint32 funStartExtendS(const uint32* const p) {//calculates align start extending right S operation
35 uint32* cig=(uint32*) (((char*) p)+9*4+((p[3]<<24)>>24));
36 if ( ((cig[0]<<28)>>28) == 4 ) {//first (right) operation is S
37 return p[2]-(cig[0]>>4);
38 } else {
39 return p[2];
40 };
41 };
42
funCigarExtendS(const uint32 * const p,uint32 * cout)43 uint32 funCigarExtendS(const uint32* const p, uint32* cout) {
44 uint32* cig=(uint32*) (((char*) p)+9*4+((p[3]<<24)>>24));
45 uint32 n=(p[4]<<16)>>16, n1=n;
46
47 if (((cig[0]<<28)>>28) == 4) {
48 --n1;
49 memcpy((char*) cout, (char*) (cig+1), n1*sizeof(uint32));//copy CIGAR starting from the 2nd operation
50 cout[0]+=(cig[0]>>4)<<4;
51 } else {
52 memcpy((char*) cout, (char*) cig, n*sizeof(uint32));//copy full CIGAR
53 };
54 if (((cig[n-1]<<28)>>28) == 4) {//remove last S opeartion add length to previous M
55 --n1;
56 cout[n1-1]+=(cig[n-1]>>4)<<4;
57 };
58 return n1;
59 };
60
funCompareCigarsExtendS(const uint32 * const pa,const uint32 * const pb)61 int funCompareCigarsExtendS(const uint32* const pa, const uint32* const pb){
62 uint32 ca[100], cb[100];
63 uint32 na=funCigarExtendS(pa,ca);
64 uint32 nb=funCigarExtendS(pb,cb);
65 compareReturn(na,nb);
66 for (uint32 ii=0; ii<na; ii++) {
67 compareReturn(ca[ii],cb[ii]);
68 };
69 return 0;
70 };
71
funCompareCoordFlagCigarSeq(const void * a,const void * b)72 int funCompareCoordFlagCigarSeq(const void *a, const void *b) {
73 uint32* pa1=(uint32*) *(uint32**) a;
74 uint32* pa2=(uint32*) *(uint32**) ((char*)a+8);
75
76 uint32* pb1=(uint32*) *(uint32**) b;
77 uint32* pb2=(uint32*) *(uint32**) ((char*)b+8);
78
79 compareReturn(funStartExtendS(pa1),funStartExtendS(pb1));//position match
80 compareReturn(funStartExtendS(pa2),funStartExtendS(pb2));//2nd mate position match
81 compareReturn(pa1[4]>>16,pb1[4]>>16);//FLAG match
82 compareReturn(pa2[4]>>16,pb2[4]>>16);//FLAG match - 2nd mate
83
84 int ret1=funCompareCigarsExtendS(pa1,pb1);
85 if (ret1!=0) return ret1;
86 ret1=funCompareCigarsExtendS(pa2,pb2);
87 if (ret1!=0) return ret1;
88
89 //compare sequences
90 uint8_t* sa=((uint8_t*) pa2)+9*4+((pa2[3]<<24)>>24)+((pa2[4]<<16)>>16)*4;
91 uint8_t* sb=((uint8_t*) pb2)+9*4+((pb2[3]<<24)>>24)+((pb2[4]<<16)>>16)*4;
92 if (((pa2[4]>>16) & 0x10) == 0) {//not reverse complemented
93 uint ii=1;
94 for (; ii<g_bamRemoveDuplicatesMate2basesN; ii+=2) {
95 compareReturn(sa[ii/2],sb[ii/2]);
96 };
97 if (g_bamRemoveDuplicatesMate2basesN%2>0) {
98 compareReturn((sa[ii/2]>>4),(sb[ii/2]>>4));
99 };
100 } else {
101 uint32 ii=pa2[5]-g_bamRemoveDuplicatesMate2basesN;
102 if (ii%2>0) {
103 compareReturn((sa[ii/2]&15),(sb[ii/2]&15));
104 ++ii;
105 };
106 for (; ii<pa2[5]; ii+=2) {
107 compareReturn(sa[ii/2],sb[ii/2]);
108 };
109 };
110
111 return 0;
112 };
113
bamRemoveDuplicates(const string bamFileName,const string bamFileNameOut,Parameters & P)114 void bamRemoveDuplicates(const string bamFileName, const string bamFileNameOut, Parameters &P) {
115 g_bamRemoveDuplicatesMate2basesN=P.removeDuplicates.mate2basesN;
116
117 bam1_t *bamA;
118 bamA=bam_init1();
119
120 BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r");
121 if (bamIn==NULL) {
122 exitWithError("EXITING because of fatal ERROR: could not open --inputBAMfile " + bamFileName +
123 "SOLUTION: check that file " + bamFileName + " exists and has proper read permissions."
124 , std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P);
125 };
126 bam_hdr_t *bamHeader=bam_hdr_read(bamIn);
127
128 BGZF *bgzfOut;
129 bgzfOut=bgzf_open(bamFileNameOut.c_str(),("w"+to_string((long long) P.outBAMcompression)).c_str());
130 bam_hdr_write(bgzfOut, bamHeader);
131
132 uint bamLengthMax=P.limitBAMsortRAM; //max length to load
133 if (bamLengthMax==0) bamLengthMax=16000000000;
134 uint grNmax=1000000;//max number of alignments
135
136 char *bamRaw=new char[bamLengthMax];
137 uint *aD=new uint[grNmax*2];
138
139 uint bamLength=0;
140 uint bamS=0, bamE=0, bamE1=1; //start/end/next-end position for read group search
141 uint32 rightMax=0;
142 uint grN=0;//number of reads in group
143 bool bamFileEnd=false;//when the last alignment of rhe file was reached
144 while (true) {
145 if (bamE1>bamLength) {//reached end of loaded BAM block, add BAM data
146 if (bamLength<bamLengthMax && bamLength>0) {//reached end of BAM file, cannot load more
147 bamFileEnd=true;
148 } else {
149 if (bamS==0 && bamLength>0) {//TODO
150 ostringstream errOut;
151 errOut <<"EXITING because of fatal ERROR: not enough memory for marking duplicates \n";
152 errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <<bamLengthMax*2;
153 exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P);
154 };
155
156 //write out processed block
157 bgzf_write(bgzfOut,bamRaw,bamS);
158
159 bamLength-=bamS;
160 memmove(bamRaw, bamRaw+bamS,bamLength); //move the non-processed part of the block to the beginning of bamRaw
161 bamLength+=bgzf_read(bamIn, bamRaw+bamLength, bamLengthMax-bamLength);//marks the end of the BAM block that has been read
162 //restart search for the group
163 bamS=0;
164 bamE=0;
165 bamE1=bamE+*(uint32*)(bamRaw+bamE)+4;//next alignment
166 rightMax=0;
167 grN=0;
168 };
169 };
170
171 int nMult=0;
172 uint32 chrE=0;
173 uint32 leftE=0;
174 uint32 rightE=0;
175 uint32 chrS=0;
176
177 if (!bamFileEnd)
178 {
179 uint32* bamP=(uint32*) (bamRaw+bamE);//pointer to the 1st mate of the pair
180
181 bamA->data=((uint8_t*) bamP)+9*4+((bamP[3]<<24)>>24)+((bamP[4]<<16)>>16)*4+(bamP[5]+1)/2+bamP[5];//add length for: core, name, cigar, seq, qual
182 bamA->l_data=((uint8_t*) bamP)+bamP[0]+1-bamA->data;
183
184 nMult=bam_aux2i(bam_aux_get(bamA,"NH"));
185
186 if (nMult==1 || (nMult>1 && P.removeDuplicates.markMulti))
187 {
188 bamP[4] |= (0x400<<16);//mark all aligns as duplicate, will unmark. If multimappers, onyl mark if markMult=true
189 };
190
191 chrE=bamP[1];
192 leftE=bamP[2];
193 rightE=bamP[7];
194
195 chrS=*(uint32*)(bamRaw+bamS+4*1);
196 };
197
198 if ( chrE !=chrS || (rightMax>0 && leftE>rightMax) || bamFileEnd ) {//found new group of reads to be processed, start collapsing procedure
199 qsort((void*) aD, grN, sizeof(uint), funCompareNames);
200 qsort((void*) aD, grN/2, 2*sizeof(uint), funCompareCoordFlagCigarSeq);
201 //go through the list and select non-duplicates
202 int bScore=-999, bP=0;
203 for (uint pp=0; pp<grN/2; pp++) {
204 uint32* bamP1=(uint32*) aD[pp*2];//pointer to the 1st mate of the pair
205 bamA->data=((uint8_t*) bamP1)+9*4+((bamP1[3]<<24)>>24)+((bamP1[4]<<16)>>16)*4+(bamP1[5]+1)/2+bamP1[5];//add length for: core, name, cigar, seq, qual
206 bamA->l_data=((uint8_t*) bamP1)+bamP1[0]+1-bamA->data;
207 int score1=bam_aux2i(bam_aux_get(bamA,"AS"));
208 if (score1>bScore) {
209 bScore=score1;
210 bP=pp;
211 };
212
213 if ( pp==(grN/2-1) || funCompareCoordFlagCigarSeq((void*) (aD+pp*2),(void*) (aD+pp*2+2))!=0 ) {//next pair is not equal to the current one
214 //un-mark duplicates
215 uint32* bamPb=(uint32*) aD[bP*2+1];//pointer to the 2nd mate of the pair
216 bamPb[4] ^= (0x400<<16);
217 bamPb=(uint32*) aD[bP*2];//pointer to the 1st mate of the pair
218 bamPb[4] ^= (0x400<<16);
219 //cout << ((char*)(bamPb+9)) <<"\n";
220 bScore=-999;//reset best score
221 };
222 };
223
224
225 //reset for the next group
226 if (bamFileEnd) break; //exit the main cycle over blocks
227 rightMax=0;
228 bamS=bamE;
229 grN=0;
230 };
231
232 if (nMult==1) {//record this alignment in the current group, unique mappers only. Multi-mappers will not be considered for collapsing, and will remain marked as duplicates
233 if (grN>=grNmax) {//reallocate
234 grNmax=grN*2;
235 uint *aD1=new uint[grNmax];
236 memcpy((char*) aD1, (char*) aD, grN*sizeof(uint));
237 delete [] aD;
238 aD=aD1;
239 cerr << "reallocated array "<<grNmax<<endl;
240 };
241 aD[grN]=(uint) bamRaw+bamE;
242 ++grN;
243 if (rightE>leftE) {//left mate, record coordinate of its right mate
244 rightMax=max(rightMax, rightE);
245 };
246 };
247
248 bamE=bamE1;//shift to the next record
249 bamE1=bamE+*(uint32*)(bamRaw+bamE)+4;//next alignment
250
251 };
252
253 bgzf_write(bgzfOut,bamRaw,bamLength);
254 bgzf_flush(bgzfOut);
255 bgzf_close(bgzfOut);
256 };
257