1 #include "SoloFeature.h"
2 #include "serviceFuns.cpp"
3 #include "SimpleGoodTuring/sgt.h"
4 #include <math.h>
5 #include <unordered_set>
6 #include <map>
7 #include <random>
8
9 double logMultinomialPDFsparse(const vector<double> &ambProfileLogP, const vector<uint32> &countCellGeneUMI, const uint32 stride, const uint32 shift, const int64 start, const uint32 nGenes, const vector<double> &logFactorial);
emptyDrops_CR()10 void SoloFeature::emptyDrops_CR()
11 {
12 if (nCB<=pSolo.cellFilter.eDcr.indMin) {
13 P.inOut->logMain << "emptyDrops_CR filtering: no empty cells found: nCB=" << nCB <<" emptyCellMinIndex="<< pSolo.cellFilter.eDcr.indMin << "\n";
14 return;
15 };
16
17 time_t rawTime;
18 time(&rawTime);
19 P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... starting emptyDrops_CR filtering" <<endl;
20
21 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
22 //find genes that were detected in all cells
23 unordered_set<uint32> featDet;
24 for (uint32 icb=0; icb<nCB; icb++) {
25 for (uint32 ig=0; ig<nGenePerCB[icb]; ig++) {
26 uint32 irec=countCellGeneUMIindex[icb]+ig*countMatStride;
27 if (countCellGeneUMI[irec + pSolo.umiDedup.countInd.main] > 0)
28 featDet.insert(countCellGeneUMI[irec]); //gene is present if it's count > 0 for
29 };
30 };
31 uint32 featDetN=featDet.size(); //total number of detected genes - this should have been done already?
32
33 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
34 //indCount - total UMI per cell sorted descending
35 typedef struct {uint32 index, count;} IndCount;
36 vector<IndCount> indCount(nCB);
37 for (uint32 ii=0; ii<nCB; ii++) {
38 indCount[ii].index=ii;
39 indCount[ii].count=nUMIperCB[ii];
40 };
41 std::sort(indCount.begin(), indCount.end(), [](const IndCount &ic1, const IndCount &ic2) {
42 return (ic1.count>ic2.count) || (ic1.count==ic2.count && ic1.index<ic2.index); //descending order by count, ascending by index
43 });
44
45 //////////////////////////////////////////////////////////////////////////////////////////
46 //ambient gene counts: sum gene expression over the collection of empty cells
47 vector<uint32> ambCount(featuresNumber,0);
48 for (auto icb=pSolo.cellFilter.eDcr.indMin; icb<min(nCB,pSolo.cellFilter.eDcr.indMax); icb++) {
49 auto icb1 = indCount[icb].index;
50 for (uint32 ig=0; ig<nGenePerCB[icb1]; ig++) {
51 auto irec = countCellGeneUMIindex[icb1]+ig*countMatStride;
52 ambCount[countCellGeneUMI[irec+0]] += countCellGeneUMI[irec + pSolo.umiDedup.countInd.main];
53 };
54 };
55 time(&rawTime);
56 P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... finished ambient cells counting" <<endl;
57
58 ////////////////////////////////////////////////////////////////////
59 //frequencies
60 map<uint32,uint32> ambCountFreq; //ordered map is not really needed
61 for (auto &ac: ambCount) {
62 ambCountFreq[ac]++;
63 };
64 if (ambCountFreq.size()<=1) {//only 0-frequency genes are in the empty cells. This is possible because nCB can contain ome cells with no genes - because of multigene
65 P.inOut->logMain << "emptyDrops_CR filtering: empty cells contain no genes\n";
66 return;
67 };
68 ambCountFreq[0] -= (featuresNumber-featDetN); //subtract genes that were not detected in *any* cells
69 uint32 maxFreq = ambCountFreq.rbegin()->first;
70
71 ///////////////////////////////////////////////////////////////////////
72 //SGT
73 vector<double> ambCountFreqSGT(maxFreq+1);//up to max frequency
74 {//SGT estimate of ambient profile
75 SGT<uint32> sgt;
76 for (auto &cf: ambCountFreq) {
77 if (cf.first != 0)
78 sgt.add(cf.first, cf.second);
79 };
80 sgt.analyse();
81
82 for (uint32 freq=0; freq<=maxFreq; freq++) {
83 sgt.estimate(freq, ambCountFreqSGT[freq]);
84 };
85 ambCountFreqSGT[0] /= ambCountFreq[0]; //divide freq=0 probability equally among all undetected genes in ambient profile
86 };
87 time(&rawTime);
88 P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... finished SGT"<<endl;
89
90 //ambient profile for all features
91 vector<double> ambProfileLogP(featuresNumber, 0.0);//logarithm
92 vector<double> ambProfilePnon0, ambProfileLogPnon0;//only non-0 genes
93 {
94 for (uint32 ig=0; ig<featuresNumber; ig++) {
95 if (featDet.count(ig)>0) {//this is only needed if normalization below is performed
96 ambProfileLogP[ig]=ambCountFreqSGT[ambCount[ig]];
97 };
98 };
99
100 double norm1 = accumulate(ambProfileLogP.begin(), ambProfileLogP.end(), 0.0);
101 ambProfileLogPnon0.reserve(ambProfileLogP.size());
102 ambProfilePnon0.reserve(ambProfileLogP.size());
103 for (auto &cf: ambProfileLogP) {
104 if (cf>0) {
105 cf /= norm1;//normalization is just in case
106 ambProfilePnon0.push_back(cf);
107 cf = std::log(cf);
108 ambProfileLogPnon0.push_back(cf);
109 };
110 };
111 };
112
113 time(&rawTime);
114 P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... finished ambient profile"<<endl;
115
116 //select candidate cells
117 uint32 iCandFirst, iCandLast; //first/last candidate cell in the descending sorted indCount
118 {
119 iCandFirst=filteredCells.nCellsSimple;//candidates start right after the cutoff for the simple filtering
120 uint32 minUMI = int(pSolo.cellFilter.eDcr.umiMinFracMedian * nUMIperCBsorted[filteredCells.nCellsSimple/2]);//this is not exactly median
121 minUMI = max(pSolo.cellFilter.eDcr.umiMin, minUMI);
122 for (iCandLast=iCandFirst; iCandLast<iCandFirst+pSolo.cellFilter.eDcr.candMaxN; iCandLast++) {
123 if (indCount[iCandLast].count<minUMI)
124 break;
125 };
126 --iCandLast;
127
128 time(&rawTime);
129 P.inOut->logMain << timeMonthDayTime(rawTime) << " ... candidate cells: minUMI="<< minUMI << "; number of candidate cells=" << iCandLast-iCandFirst+1 <<endl;
130 if (iCandLast<iCandFirst)
131 return; //no candidate cells to consider
132 };
133
134 //calculate observed probability for each candidate
135 vector<double> obsLogProb(iCandLast-iCandFirst+1);
136 {
137 vector<double> logFactorial; //tabulate log-factorial
138 logFactorial.resize(indCount[iCandFirst].count+1);
139 logFactorial[1]=0;
140 for (uint32 cc=2; cc<logFactorial.size(); cc++)
141 logFactorial[cc]=logFactorial[cc-1]+std::log(cc);
142
143 for (uint32 icand=0; icand<obsLogProb.size(); icand++) {
144 auto icell=indCount[icand+iCandFirst].index;
145 obsLogProb[icand]=logMultinomialPDFsparse(ambProfileLogP, countCellGeneUMI, countMatStride, pSolo.umiDedup.countInd.main, countCellGeneUMIindex[icell], nGenePerCB[icell], logFactorial);
146 };
147 time(&rawTime);
148 }
149 P.inOut->logMain << timeMonthDayTime(rawTime) << " ... finished observed logProb" <<endl;
150
151 //simulate the probabilities for each cell count
152 vector<vector<double>> simLogProb(pSolo.cellFilter.eDcr.simN);
153 {
154 std::discrete_distribution<uint32> distrAmb ( ambProfilePnon0.begin(), ambProfilePnon0.end() );
155 auto maxCount=indCount[iCandFirst].count;
156
157 //#pragma omp parallel for num_threads(P.runThreadN) //does not increase speed significantly - might be useful for larger number of simulations
158 for (uint64 isim=0; isim<simLogProb.size(); isim++) {
159 simLogProb[isim].resize(maxCount+1);
160 simLogProb[isim][0]=0;
161
162 std::mt19937 rngGen(19760110LLU*(isim+1));
163
164 vector<uint32> currCounts(ambProfilePnon0.size(), 0);
165 for (uint32 ic=1; ic<=maxCount; ic++) {
166 uint32 ig1 = distrAmb(rngGen);
167 currCounts[ig1]++;
168 simLogProb[isim][ic] = simLogProb[isim][ic-1] + ambProfileLogPnon0[ig1] + std::log(ic) - std::log(currCounts[ig1]);
169 };
170 };
171 };
172 time(&rawTime);
173 P.inOut->logMain << timeMonthDayTime(rawTime) << " ... finished simulations" <<endl;
174
175
176 //p-values
177 typedef struct{uint32 index; double p; double padj;} IndPPadj;
178 vector<IndPPadj> pValues(obsLogProb.size());
179 {
180 for (uint32 icand=0; icand<obsLogProb.size(); icand++) {
181 pValues[icand].index=indCount[icand+iCandFirst].index;
182 auto count1=indCount[icand+iCandFirst].count;
183
184 //auto funSumLess = [&] (uint32 n, vector<double> sp) { return n + (sp[count1]<obsLogProb[icand]); };
185 //uint32 nLowerP = std::accumulate<uint32>(simLogProb.begin(), simLogProb.end(), 0, funSumLess);
186 uint32 nLowerP=0;
187 //for (uint64 isim=0; isim<simLogProb.size(); isim++) {
188 for (auto &sp: simLogProb) {
189 nLowerP += ( sp[count1]<obsLogProb[icand] );
190 };
191
192 pValues[icand].p=double(1+nLowerP)/(1+simLogProb.size());
193 };
194 //BH
195 std::sort(pValues.begin(), pValues.end(), [](const IndPPadj &ip1, const IndPPadj &ip2) {return (ip1.p < ip2.p);} );
196 uint32 rank=0;
197 for (auto &ip: pValues) {
198 rank++;
199 ip.padj=ip.p*pValues.size()/rank;
200 };
201 for (auto ip=pValues.rbegin()+1; ip!=pValues.rend(); ++ip)
202 ip->padj = min(ip->padj, (ip-1)->padj); //make it non-decreasing
203 };
204
205 uint32 extraCells=0;
206 for (auto &ip: pValues) {
207 if (ip.padj<=pSolo.cellFilter.eDcr.FDR) {
208 ++extraCells;
209 filteredCells.filtVecBool[ip.index]=true;
210 };
211 };
212 time(&rawTime);
213 P.inOut->logMain << timeMonthDayTime(rawTime) << " ... finished emptyDrops_CR filtering: number of additional non-ambient cells=" << extraCells <<endl;
214
215 return;
216 };
217
logMultinomialPDFsparse(const vector<double> & ambProfileLogP,const vector<uint32> & countCellGeneUMI,const uint32 stride,const uint32 shift,const int64 start,const uint32 nGenes,const vector<double> & logFactorial)218 double logMultinomialPDFsparse(const vector<double> &ambProfileLogP, const vector<uint32> &countCellGeneUMI, const uint32 stride, const uint32 shift, const int64 start, const uint32 nGenes, const vector<double> &logFactorial)
219 {
220 uint32 sumCount=0;
221 double sumLogFac=0.0, sumCountLogP=0.0;
222 for (uint32 ig=0; ig<nGenes; ig++) {
223 auto count1 = countCellGeneUMI[start+ig*stride+shift];
224 sumCount += count1;
225 sumLogFac += logFactorial[count1];
226 sumCountLogP += ambProfileLogP[countCellGeneUMI[start+ig*stride]] * count1;
227 };
228
229 return logFactorial[sumCount] - sumLogFac + sumCountLogP;
230 };