1 #include "SoloFeature.h"
2 #include "streamFuns.h"
3 #include "TimeFunctions.h"
4 #include "SequenceFuns.h"
5 #include "serviceFuns.cpp"
6 #include "soloInputFeatureUMI.h"
7
8
countSmartSeq()9 void SoloFeature::countSmartSeq()
10 {
11 time_t rawTime;
12
13 //nCB=pSolo.cbWLsize; //all cells are recorded
14 //nCB, indCB are recorded in sumThreads
15
16 for (int ii=0; ii<P.runThreadN; ii++) {
17 readFeatSum->addStats(*readFeatAll[ii]);
18 };
19
20 redistributeReadsByCB();
21
22 time(&rawTime);
23 P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... Finished redistribution of reads from Solo read files"<<endl;
24
25 //////////////////////////////////////////////////////
26 nReadPerCB.resize(nCB);
27 cbFeatureUMImap.resize(nCB);
28
29 typedef struct {
30 uint32 feature;
31 uint32 count[2];//[0]=NoDedup, [1]=Exact
32 } typeFeatureCount;
33
34 vector<vector<typeFeatureCount>> vCellFeatureCount(nCB);
35
36 typedef struct {
37 uint32 feature;
38 uint64 umi;
39 } typeFeatureUMI;
40 vector<vector<typeFeatureUMI>::iterator> cbFeatUMI (nCB + 1);
41
42 #pragma omp parallel for num_threads(P.runThreadN)
43 for (uint32 ired=0; ired<redistrFilesCBfirst.size()-1; ired++) {//TODO: parallelize, each ired is independent here!
44 vector<typeFeatureUMI> vFeatureUMI (redistrFilesNreads[ired]);
45
46 uint32 iCB1=redistrFilesCBfirst[ired];
47 uint32 iCB2=redistrFilesCBfirst[ired+1];
48
49 //allocate arrays
50 cbFeatUMI[iCB1]=vFeatureUMI.begin();
51 for (uint32 icb=iCB1+1; icb<iCB2; icb++) {
52 cbFeatUMI[icb] = cbFeatUMI[icb-1] + readFeatSum->cbReadCount[indCB[icb-1]];
53 };
54
55 //input records
56 redistrFilesStreams[ired]->flush();
57 redistrFilesStreams[ired]->seekg(0,ios::beg);
58
59 uint32 feature;
60 uint64 umi, iread;
61 int32 cbmatch;
62 int64 cb;
63 vector<uint32> trIdDist; //not used
64 bool readInfoYes=false;
65 while (soloInputFeatureUMI(redistrFilesStreams[ired], featureType, readInfoYes, P.sjAll, iread, cbmatch, feature, umi, trIdDist)) {//cycle over file records
66
67 *redistrFilesStreams[ired] >> cb;
68
69 if (feature+1 == 0)
70 continue;
71
72 uint32 icb=indCBwl[cb];
73 *( cbFeatUMI[icb] + nReadPerCB[icb] )={feature,umi};
74 nReadPerCB[icb]++;
75 };
76
77 //collapse UMI, simple
78 for (uint32 icb=iCB1; icb<iCB2; icb++) {
79 if (nReadPerCB[icb]==0) {
80 continue;
81 };
82
83 sort(cbFeatUMI[icb], cbFeatUMI[icb]+nReadPerCB[icb],
84 [](const typeFeatureUMI &a, const typeFeatureUMI &b)
85 {return (a.feature < b.feature) || ( a.feature == b.feature && a.umi < b.umi); });
86
87 vCellFeatureCount[icb].reserve(8192);
88 vCellFeatureCount[icb].push_back({cbFeatUMI[icb]->feature, {1,1}});//first read
89 for (auto fu=cbFeatUMI[icb]+1; fu!=cbFeatUMI[icb]+nReadPerCB[icb]; fu++) {//cycle over all reads for this icb
90 if ( fu->feature != (fu-1)->feature ) {//compare to previous feature
91 vCellFeatureCount[icb].push_back({fu->feature, {1,1}});//create next feature entry
92 } else {//same feature
93 vCellFeatureCount[icb].back().count[0]++; //non-collapsed UMI count
94
95 if ( fu->umi != (fu-1)->umi ) {//same feature, new umi
96 vCellFeatureCount[icb].back().count[1]++;//collapsed UMI count
97 };
98 };
99 };
100 };
101 };
102
103 time(&rawTime);
104 P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished reading / collapsing" <<endl;
105
106 //convert to countCellGeneUMI. TODO - this is not necessary, recode output to use vCellFeatureCount
107 nUMIperCB.resize(nCB);
108 nGenePerCB.resize(nCB);
109
110 countMatStride = pSolo.umiDedup.yes.N + 1;
111
112 uint64 ccgN=0;//total number of entries in the countCellGeneUMI
113 for (auto &cbf : vCellFeatureCount) {
114 ccgN+=cbf.size();
115 };
116 countCellGeneUMI.resize(ccgN*countMatStride);
117 countCellGeneUMIindex.resize(nCB+1);
118 countCellGeneUMIindex[0]=0;
119
120 for (uint32 icb=0; icb<nCB; icb++) {//copy vCellFeatureCount
121
122 nGenePerCB[icb]=vCellFeatureCount[icb].size();
123 countCellGeneUMIindex[icb+1] = countCellGeneUMIindex[icb] + nGenePerCB[icb]*countMatStride;
124
125 for (uint64 ic=countCellGeneUMIindex[icb], ig=0; ic<countCellGeneUMIindex[icb+1]; ic+=countMatStride, ++ig) {//loop over genes
126
127 countCellGeneUMI[ic + 0] = vCellFeatureCount[icb][ig].feature;
128
129 if ( pSolo.umiDedup.yes.NoDedup )
130 countCellGeneUMI[ic + pSolo.umiDedup.countInd.NoDedup] = vCellFeatureCount[icb][ig].count[0];
131 if ( pSolo.umiDedup.yes.Exact )
132 countCellGeneUMI[ic + pSolo.umiDedup.countInd.Exact] = vCellFeatureCount[icb][ig].count[1];//collapsed UMI count
133
134 nUMIperCB[icb] += countCellGeneUMI[ic + pSolo.umiDedup.countInd.main];
135 };
136
137 };
138
139 // sum stats
140 nReadPerCBtotal = nReadPerCB;
141 nReadPerCBunique = nReadPerCB;
142
143 uint32 nReadPerCBmax=0;
144 for (uint32 icb=0; icb<nCB; icb++) {
145
146 nReadPerCBmax=max(nReadPerCBmax,nReadPerCB[icb]);
147 readFeatSum->stats.V[readFeatSum->stats.nMatch] += nReadPerCBtotal[icb];
148 readFeatSum->stats.V[readFeatSum->stats.nMatchUnique] += nReadPerCBunique[icb];
149
150 readFeatSum->stats.V[readFeatSum->stats.nUMIs] += nUMIperCB[icb];
151 if (nGenePerCB[icb]>0)
152 ++readFeatSum->stats.V[readFeatSum->stats.nCellBarcodes];
153 };
154
155 readFeatSum->stats.V[readFeatSum->stats.nExactMatch]=readFeatSum->stats.V[readFeatSum->stats.nMatch];
156
157 time(&rawTime);
158 P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished SmartSeq counting" <<endl;
159 };
160