1 #include <unordered_map>
2 #include "bamRemoveDuplicates.h"
3 #include <iostream>
4 #include <htslib/sam.h>
5 #include "IncludeDefine.h"
6 #include SAMTOOLS_BGZF_H
7 #include "ErrorWarning.h"
8 
9 #define compareReturn(a,b) if(a>b) {return 1;} else if (a<b) {return -1;}
10 
11 uint g_bamRemoveDuplicatesMate2basesN;
12 
funCompareNames(const void * a,const void * b)13 int funCompareNames(const void *a, const void *b) {//compare read names
14     uint32* pa=(uint32*) *(uint32**) a;
15     uint32* pb=(uint32*) *(uint32**) b;
16 
17     uint32 la=(pa[3]<<24)>>24;
18     uint32 lb=(pb[3]<<24)>>24;
19 
20     compareReturn(la,lb) else {
21         char* ca=(char*) (pa+9);
22         char* cb=(char*) (pb+9);
23         for (uint32 ii=0;ii<la;ii++) {
24             compareReturn(ca[ii],cb[ii]);
25         };
26         uint32 fa=pa[4]>>16;
27         uint32 fb=pb[4]>>16;
28 
29         compareReturn((fa&0x80), (fb&0x80));
30         return 0;
31     };
32 };
33 
funStartExtendS(const uint32 * const p)34 uint32 funStartExtendS(const uint32* const p) {//calculates align start extending right S operation
35     uint32* cig=(uint32*) (((char*) p)+9*4+((p[3]<<24)>>24));
36     if ( ((cig[0]<<28)>>28) == 4 ) {//first (right) operation is S
37         return p[2]-(cig[0]>>4);
38     } else {
39         return p[2];
40     };
41 };
42 
funCigarExtendS(const uint32 * const p,uint32 * cout)43 uint32 funCigarExtendS(const uint32* const p, uint32* cout) {
44     uint32* cig=(uint32*) (((char*) p)+9*4+((p[3]<<24)>>24));
45     uint32 n=(p[4]<<16)>>16, n1=n;
46 
47     if (((cig[0]<<28)>>28) == 4) {
48         --n1;
49         memcpy((char*) cout, (char*) (cig+1), n1*sizeof(uint32));//copy CIGAR starting from the 2nd operation
50         cout[0]+=(cig[0]>>4)<<4;
51     } else {
52         memcpy((char*) cout, (char*) cig, n*sizeof(uint32));//copy full CIGAR
53     };
54     if (((cig[n-1]<<28)>>28) == 4) {//remove last S opeartion add length to previous M
55         --n1;
56         cout[n1-1]+=(cig[n-1]>>4)<<4;
57     };
58     return n1;
59 };
60 
funCompareCigarsExtendS(const uint32 * const pa,const uint32 * const pb)61 int funCompareCigarsExtendS(const uint32* const pa, const uint32* const pb){
62     uint32 ca[100], cb[100];
63     uint32 na=funCigarExtendS(pa,ca);
64     uint32 nb=funCigarExtendS(pb,cb);
65     compareReturn(na,nb);
66     for (uint32 ii=0; ii<na; ii++) {
67         compareReturn(ca[ii],cb[ii]);
68     };
69     return 0;
70 };
71 
funCompareCoordFlagCigarSeq(const void * a,const void * b)72 int funCompareCoordFlagCigarSeq(const void *a, const void *b) {
73     uint32* pa1=(uint32*) *(uint32**) a;
74     uint32* pa2=(uint32*) *(uint32**) ((char*)a+8);
75 
76     uint32* pb1=(uint32*) *(uint32**) b;
77     uint32* pb2=(uint32*) *(uint32**) ((char*)b+8);
78 
79     compareReturn(funStartExtendS(pa1),funStartExtendS(pb1));//position match
80     compareReturn(funStartExtendS(pa2),funStartExtendS(pb2));//2nd mate position match
81     compareReturn(pa1[4]>>16,pb1[4]>>16);//FLAG match
82     compareReturn(pa2[4]>>16,pb2[4]>>16);//FLAG match - 2nd mate
83 
84     int ret1=funCompareCigarsExtendS(pa1,pb1);
85     if (ret1!=0) return ret1;
86     ret1=funCompareCigarsExtendS(pa2,pb2);
87     if (ret1!=0) return ret1;
88 
89     //compare sequences
90     uint8_t* sa=((uint8_t*) pa2)+9*4+((pa2[3]<<24)>>24)+((pa2[4]<<16)>>16)*4;
91     uint8_t* sb=((uint8_t*) pb2)+9*4+((pb2[3]<<24)>>24)+((pb2[4]<<16)>>16)*4;
92     if (((pa2[4]>>16) & 0x10) == 0) {//not reverse complemented
93         uint ii=1;
94         for (; ii<g_bamRemoveDuplicatesMate2basesN; ii+=2) {
95             compareReturn(sa[ii/2],sb[ii/2]);
96         };
97         if (g_bamRemoveDuplicatesMate2basesN%2>0) {
98             compareReturn((sa[ii/2]>>4),(sb[ii/2]>>4));
99         };
100     } else {
101         uint32 ii=pa2[5]-g_bamRemoveDuplicatesMate2basesN;
102         if (ii%2>0) {
103             compareReturn((sa[ii/2]&15),(sb[ii/2]&15));
104             ++ii;
105         };
106         for (; ii<pa2[5]; ii+=2) {
107             compareReturn(sa[ii/2],sb[ii/2]);
108         };
109     };
110 
111     return 0;
112 };
113 
bamRemoveDuplicates(const string bamFileName,const string bamFileNameOut,Parameters & P)114 void bamRemoveDuplicates(const string bamFileName, const string bamFileNameOut, Parameters &P) {
115     g_bamRemoveDuplicatesMate2basesN=P.removeDuplicates.mate2basesN;
116 
117     bam1_t *bamA;
118     bamA=bam_init1();
119 
120     BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r");
121     if (bamIn==NULL) {
122         exitWithError("EXITING because of fatal ERROR: could not open --inputBAMfile " + bamFileName +
123                       "SOLUTION: check that file   " + bamFileName + "   exists and has proper read permissions."
124                         , std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P);
125     };
126     bam_hdr_t *bamHeader=bam_hdr_read(bamIn);
127 
128     BGZF *bgzfOut;
129     bgzfOut=bgzf_open(bamFileNameOut.c_str(),("w"+to_string((long long) P.outBAMcompression)).c_str());
130     bam_hdr_write(bgzfOut, bamHeader);
131 
132     uint bamLengthMax=P.limitBAMsortRAM; //max length to load
133     if (bamLengthMax==0) bamLengthMax=16000000000;
134     uint grNmax=1000000;//max number of alignments
135 
136     char *bamRaw=new char[bamLengthMax];
137     uint *aD=new uint[grNmax*2];
138 
139     uint bamLength=0;
140     uint bamS=0, bamE=0, bamE1=1; //start/end/next-end position for read group search
141     uint32 rightMax=0;
142     uint grN=0;//number of reads in group
143     bool bamFileEnd=false;//when the last alignment of rhe file was reached
144     while (true) {
145         if (bamE1>bamLength) {//reached end of loaded BAM block, add BAM data
146             if (bamLength<bamLengthMax && bamLength>0) {//reached end of BAM file, cannot load more
147                 bamFileEnd=true;
148             } else {
149                 if (bamS==0 && bamLength>0) {//TODO
150                     ostringstream errOut;
151                     errOut <<"EXITING because of fatal ERROR: not enough memory for marking duplicates \n";
152                     errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <<bamLengthMax*2;
153                     exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P);
154                 };
155 
156                 //write out processed block
157                 bgzf_write(bgzfOut,bamRaw,bamS);
158 
159                 bamLength-=bamS;
160                 memmove(bamRaw, bamRaw+bamS,bamLength); //move the non-processed part of the block to the beginning of bamRaw
161                 bamLength+=bgzf_read(bamIn, bamRaw+bamLength, bamLengthMax-bamLength);//marks the end of the BAM block that has been read
162                 //restart search for the group
163                 bamS=0;
164                 bamE=0;
165                 bamE1=bamE+*(uint32*)(bamRaw+bamE)+4;//next alignment
166                 rightMax=0;
167                 grN=0;
168             };
169         };
170 
171         int nMult=0;
172         uint32 chrE=0;
173         uint32 leftE=0;
174         uint32 rightE=0;
175         uint32 chrS=0;
176 
177         if (!bamFileEnd)
178         {
179             uint32* bamP=(uint32*) (bamRaw+bamE);//pointer to the 1st mate of the pair
180 
181             bamA->data=((uint8_t*) bamP)+9*4+((bamP[3]<<24)>>24)+((bamP[4]<<16)>>16)*4+(bamP[5]+1)/2+bamP[5];//add length for: core, name, cigar, seq, qual
182             bamA->l_data=((uint8_t*) bamP)+bamP[0]+1-bamA->data;
183 
184             nMult=bam_aux2i(bam_aux_get(bamA,"NH"));
185 
186             if (nMult==1 || (nMult>1 && P.removeDuplicates.markMulti))
187             {
188                 bamP[4] |= (0x400<<16);//mark all aligns as duplicate, will unmark. If multimappers, onyl mark if markMult=true
189             };
190 
191             chrE=bamP[1];
192             leftE=bamP[2];
193             rightE=bamP[7];
194 
195             chrS=*(uint32*)(bamRaw+bamS+4*1);
196         };
197 
198         if ( chrE !=chrS ||  (rightMax>0 && leftE>rightMax) || bamFileEnd ) {//found new group of reads to be processed, start collapsing procedure
199             qsort((void*) aD, grN, sizeof(uint), funCompareNames);
200             qsort((void*) aD, grN/2, 2*sizeof(uint), funCompareCoordFlagCigarSeq);
201             //go through the list and select non-duplicates
202             int bScore=-999, bP=0;
203             for (uint pp=0; pp<grN/2; pp++) {
204                 uint32* bamP1=(uint32*) aD[pp*2];//pointer to the 1st mate of the pair
205                 bamA->data=((uint8_t*) bamP1)+9*4+((bamP1[3]<<24)>>24)+((bamP1[4]<<16)>>16)*4+(bamP1[5]+1)/2+bamP1[5];//add length for: core, name, cigar, seq, qual
206                 bamA->l_data=((uint8_t*) bamP1)+bamP1[0]+1-bamA->data;
207                 int score1=bam_aux2i(bam_aux_get(bamA,"AS"));
208                 if (score1>bScore) {
209                     bScore=score1;
210                     bP=pp;
211                 };
212 
213                 if ( pp==(grN/2-1) || funCompareCoordFlagCigarSeq((void*) (aD+pp*2),(void*) (aD+pp*2+2))!=0 ) {//next pair is not equal to the current one
214                     //un-mark duplicates
215                     uint32* bamPb=(uint32*) aD[bP*2+1];//pointer to the 2nd mate of the pair
216                     bamPb[4] ^= (0x400<<16);
217                     bamPb=(uint32*) aD[bP*2];//pointer to the 1st mate of the pair
218                     bamPb[4] ^= (0x400<<16);
219                     //cout << ((char*)(bamPb+9)) <<"\n";
220                     bScore=-999;//reset best score
221                 };
222             };
223 
224 
225             //reset for the next group
226             if (bamFileEnd) break; //exit the main cycle over blocks
227             rightMax=0;
228             bamS=bamE;
229             grN=0;
230         };
231 
232         if (nMult==1) {//record this alignment in the current group, unique mappers only. Multi-mappers will not be considered for collapsing, and will remain marked as duplicates
233             if (grN>=grNmax) {//reallocate
234                 grNmax=grN*2;
235                 uint *aD1=new uint[grNmax];
236                 memcpy((char*) aD1, (char*) aD, grN*sizeof(uint));
237                 delete [] aD;
238                 aD=aD1;
239                 cerr << "reallocated array "<<grNmax<<endl;
240             };
241             aD[grN]=(uint) bamRaw+bamE;
242             ++grN;
243             if (rightE>leftE) {//left mate, record coordinate of its right mate
244                 rightMax=max(rightMax, rightE);
245             };
246         };
247 
248         bamE=bamE1;//shift to the next record
249         bamE1=bamE+*(uint32*)(bamRaw+bamE)+4;//next alignment
250 
251     };
252 
253     bgzf_write(bgzfOut,bamRaw,bamLength);
254     bgzf_flush(bgzfOut);
255     bgzf_close(bgzfOut);
256 };
257