1 #include "SoloFeature.h"
2 #include "serviceFuns.cpp"
3 #include "SimpleGoodTuring/sgt.h"
4 #include <math.h>
5 #include <unordered_set>
6 #include <map>
7 #include <random>
8 
9 double logMultinomialPDFsparse(const vector<double> &ambProfileLogP, const vector<uint32> &countCellGeneUMI, const uint32 stride, const uint32 shift, const int64 start, const uint32 nGenes, const vector<double> &logFactorial);
emptyDrops_CR()10 void SoloFeature::emptyDrops_CR()
11 {
12     if (nCB<=pSolo.cellFilter.eDcr.indMin) {
13         P.inOut->logMain << "emptyDrops_CR filtering: no empty cells found: nCB=" << nCB <<"   emptyCellMinIndex="<< pSolo.cellFilter.eDcr.indMin << "\n";
14         return;
15     };
16 
17     time_t rawTime;
18     time(&rawTime);
19     P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... starting emptyDrops_CR filtering" <<endl;
20 
21     //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
22     //find genes that were detected in all cells
23     unordered_set<uint32> featDet;
24     for (uint32 icb=0; icb<nCB; icb++) {
25         for (uint32 ig=0; ig<nGenePerCB[icb]; ig++) {
26             uint32 irec=countCellGeneUMIindex[icb]+ig*countMatStride;
27             if (countCellGeneUMI[irec + pSolo.umiDedup.countInd.main] > 0)
28                 featDet.insert(countCellGeneUMI[irec]); //gene is present if it's count > 0 for
29         };
30     };
31     uint32 featDetN=featDet.size(); //total number of detected genes - this should have been done already?
32 
33     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
34     //indCount - total UMI per cell sorted descending
35     typedef struct {uint32 index, count;} IndCount;
36     vector<IndCount> indCount(nCB);
37     for (uint32 ii=0; ii<nCB; ii++) {
38         indCount[ii].index=ii;
39         indCount[ii].count=nUMIperCB[ii];
40     };
41     std::sort(indCount.begin(), indCount.end(), [](const IndCount &ic1, const IndCount &ic2) {
42                                                     return (ic1.count>ic2.count) || (ic1.count==ic2.count && ic1.index<ic2.index); //descending order by count, ascending by index
43                                                 });
44 
45     //////////////////////////////////////////////////////////////////////////////////////////
46     //ambient gene counts: sum gene expression over the collection of empty cells
47     vector<uint32> ambCount(featuresNumber,0);
48     for (auto icb=pSolo.cellFilter.eDcr.indMin; icb<min(nCB,pSolo.cellFilter.eDcr.indMax); icb++) {
49         auto icb1 = indCount[icb].index;
50         for (uint32 ig=0; ig<nGenePerCB[icb1]; ig++) {
51             auto irec = countCellGeneUMIindex[icb1]+ig*countMatStride;
52             ambCount[countCellGeneUMI[irec+0]] += countCellGeneUMI[irec + pSolo.umiDedup.countInd.main];
53         };
54     };
55     time(&rawTime);
56     P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... finished ambient cells counting" <<endl;
57 
58     ////////////////////////////////////////////////////////////////////
59     //frequencies
60     map<uint32,uint32> ambCountFreq; //ordered map is not really needed
61     for (auto &ac: ambCount) {
62         ambCountFreq[ac]++;
63     };
64     if (ambCountFreq.size()<=1) {//only 0-frequency genes are in the empty cells. This is possible because nCB can contain ome cells with no genes - because of multigene
65         P.inOut->logMain << "emptyDrops_CR filtering: empty cells contain no genes\n";
66         return;
67     };
68     ambCountFreq[0] -= (featuresNumber-featDetN); //subtract genes that were not detected in *any* cells
69     uint32 maxFreq = ambCountFreq.rbegin()->first;
70 
71     ///////////////////////////////////////////////////////////////////////
72     //SGT
73     vector<double> ambCountFreqSGT(maxFreq+1);//up to max frequency
74     {//SGT estimate of ambient profile
75         SGT<uint32> sgt;
76         for (auto &cf: ambCountFreq) {
77             if (cf.first != 0)
78                 sgt.add(cf.first, cf.second);
79         };
80         sgt.analyse();
81 
82         for (uint32 freq=0; freq<=maxFreq; freq++) {
83             sgt.estimate(freq, ambCountFreqSGT[freq]);
84         };
85         ambCountFreqSGT[0] /= ambCountFreq[0]; //divide freq=0 probability equally among all undetected genes in ambient profile
86     };
87     time(&rawTime);
88     P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... finished SGT"<<endl;
89 
90     //ambient profile for all features
91     vector<double> ambProfileLogP(featuresNumber, 0.0);//logarithm
92     vector<double> ambProfilePnon0, ambProfileLogPnon0;//only non-0 genes
93     {
94         for (uint32 ig=0; ig<featuresNumber; ig++) {
95             if (featDet.count(ig)>0) {//this is only needed if normalization below is performed
96                 ambProfileLogP[ig]=ambCountFreqSGT[ambCount[ig]];
97             };
98         };
99 
100         double norm1 = accumulate(ambProfileLogP.begin(), ambProfileLogP.end(), 0.0);
101         ambProfileLogPnon0.reserve(ambProfileLogP.size());
102         ambProfilePnon0.reserve(ambProfileLogP.size());
103         for (auto &cf: ambProfileLogP) {
104             if (cf>0) {
105                 cf /= norm1;//normalization is just in case
106                 ambProfilePnon0.push_back(cf);
107                 cf = std::log(cf);
108                 ambProfileLogPnon0.push_back(cf);
109             };
110         };
111     };
112 
113     time(&rawTime);
114     P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... finished ambient profile"<<endl;
115 
116     //select candidate cells
117     uint32 iCandFirst, iCandLast; //first/last candidate cell in the descending sorted indCount
118     {
119         iCandFirst=filteredCells.nCellsSimple;//candidates start right after the cutoff for the simple filtering
120         uint32 minUMI = int(pSolo.cellFilter.eDcr.umiMinFracMedian * nUMIperCBsorted[filteredCells.nCellsSimple/2]);//this is not exactly median
121         minUMI = max(pSolo.cellFilter.eDcr.umiMin, minUMI);
122         for (iCandLast=iCandFirst; iCandLast<iCandFirst+pSolo.cellFilter.eDcr.candMaxN; iCandLast++) {
123             if (indCount[iCandLast].count<minUMI)
124                 break;
125         };
126         --iCandLast;
127 
128         time(&rawTime);
129         P.inOut->logMain << timeMonthDayTime(rawTime) << " ... candidate cells: minUMI="<< minUMI << "; number of candidate cells=" << iCandLast-iCandFirst+1 <<endl;
130         if (iCandLast<iCandFirst)
131             return; //no candidate cells to consider
132     };
133 
134     //calculate observed probability for each candidate
135     vector<double> obsLogProb(iCandLast-iCandFirst+1);
136     {
137         vector<double> logFactorial; //tabulate log-factorial
138         logFactorial.resize(indCount[iCandFirst].count+1);
139         logFactorial[1]=0;
140         for (uint32 cc=2; cc<logFactorial.size(); cc++)
141             logFactorial[cc]=logFactorial[cc-1]+std::log(cc);
142 
143         for (uint32 icand=0; icand<obsLogProb.size(); icand++) {
144             auto icell=indCount[icand+iCandFirst].index;
145             obsLogProb[icand]=logMultinomialPDFsparse(ambProfileLogP, countCellGeneUMI, countMatStride, pSolo.umiDedup.countInd.main, countCellGeneUMIindex[icell], nGenePerCB[icell], logFactorial);
146         };
147         time(&rawTime);
148     }
149     P.inOut->logMain << timeMonthDayTime(rawTime) << " ... finished observed logProb" <<endl;
150 
151     //simulate the probabilities for each cell count
152     vector<vector<double>> simLogProb(pSolo.cellFilter.eDcr.simN);
153     {
154         std::discrete_distribution<uint32> distrAmb ( ambProfilePnon0.begin(), ambProfilePnon0.end() );
155         auto maxCount=indCount[iCandFirst].count;
156 
157         //#pragma omp parallel for num_threads(P.runThreadN) //does not increase speed significantly - might be useful for larger number of simulations
158         for (uint64 isim=0; isim<simLogProb.size(); isim++) {
159             simLogProb[isim].resize(maxCount+1);
160             simLogProb[isim][0]=0;
161 
162             std::mt19937 rngGen(19760110LLU*(isim+1));
163 
164             vector<uint32> currCounts(ambProfilePnon0.size(), 0);
165             for (uint32 ic=1; ic<=maxCount; ic++) {
166                 uint32 ig1 = distrAmb(rngGen);
167                 currCounts[ig1]++;
168                 simLogProb[isim][ic] = simLogProb[isim][ic-1] + ambProfileLogPnon0[ig1] + std::log(ic) - std::log(currCounts[ig1]);
169             };
170         };
171     };
172     time(&rawTime);
173     P.inOut->logMain << timeMonthDayTime(rawTime) << " ... finished simulations" <<endl;
174 
175 
176     //p-values
177     typedef struct{uint32 index; double p; double padj;} IndPPadj;
178     vector<IndPPadj> pValues(obsLogProb.size());
179     {
180         for (uint32 icand=0; icand<obsLogProb.size(); icand++) {
181             pValues[icand].index=indCount[icand+iCandFirst].index;
182             auto count1=indCount[icand+iCandFirst].count;
183 
184             //auto funSumLess = [&] (uint32 n, vector<double> sp) { return n + (sp[count1]<obsLogProb[icand]); };
185             //uint32 nLowerP = std::accumulate<uint32>(simLogProb.begin(), simLogProb.end(), 0, funSumLess);
186             uint32 nLowerP=0;
187             //for (uint64 isim=0; isim<simLogProb.size(); isim++) {
188             for (auto &sp: simLogProb) {
189                 nLowerP += ( sp[count1]<obsLogProb[icand] );
190             };
191 
192             pValues[icand].p=double(1+nLowerP)/(1+simLogProb.size());
193         };
194         //BH
195         std::sort(pValues.begin(), pValues.end(), [](const IndPPadj &ip1, const IndPPadj &ip2) {return (ip1.p < ip2.p);} );
196         uint32 rank=0;
197         for (auto &ip: pValues) {
198             rank++;
199             ip.padj=ip.p*pValues.size()/rank;
200         };
201         for (auto ip=pValues.rbegin()+1; ip!=pValues.rend(); ++ip)
202             ip->padj = min(ip->padj, (ip-1)->padj); //make it non-decreasing
203     };
204 
205     uint32 extraCells=0;
206     for (auto &ip: pValues) {
207         if (ip.padj<=pSolo.cellFilter.eDcr.FDR) {
208             ++extraCells;
209             filteredCells.filtVecBool[ip.index]=true;
210         };
211     };
212     time(&rawTime);
213     P.inOut->logMain << timeMonthDayTime(rawTime) <<  " ... finished emptyDrops_CR filtering: number of additional non-ambient cells=" << extraCells <<endl;
214 
215     return;
216 };
217 
logMultinomialPDFsparse(const vector<double> & ambProfileLogP,const vector<uint32> & countCellGeneUMI,const uint32 stride,const uint32 shift,const int64 start,const uint32 nGenes,const vector<double> & logFactorial)218 double logMultinomialPDFsparse(const vector<double> &ambProfileLogP, const vector<uint32> &countCellGeneUMI, const uint32 stride, const uint32 shift, const int64 start, const uint32 nGenes, const vector<double> &logFactorial)
219 {
220     uint32 sumCount=0;
221     double sumLogFac=0.0, sumCountLogP=0.0;
222     for (uint32 ig=0; ig<nGenes; ig++) {
223         auto count1 = countCellGeneUMI[start+ig*stride+shift];
224         sumCount += count1;
225         sumLogFac += logFactorial[count1];
226         sumCountLogP += ambProfileLogP[countCellGeneUMI[start+ig*stride]] * count1;
227     };
228 
229     return logFactorial[sumCount] - sumLogFac + sumCountLogP;
230 };