1 #include "SoloFeature.h"
2 #include "streamFuns.h"
3 #include "TimeFunctions.h"
4 #include "SequenceFuns.h"
5 #include "serviceFuns.cpp"
6 #include "soloInputFeatureUMI.h"
7 
8 
countSmartSeq()9 void SoloFeature::countSmartSeq()
10 {
11     time_t rawTime;
12 
13     //nCB=pSolo.cbWLsize; //all cells are recorded
14     //nCB, indCB are recorded in sumThreads
15 
16     for (int ii=0; ii<P.runThreadN; ii++) {
17         readFeatSum->addStats(*readFeatAll[ii]);
18     };
19 
20     redistributeReadsByCB();
21 
22     time(&rawTime);
23     P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... Finished redistribution of reads from Solo read files"<<endl;
24 
25     //////////////////////////////////////////////////////
26     nReadPerCB.resize(nCB);
27     cbFeatureUMImap.resize(nCB);
28 
29     typedef struct {
30         uint32 feature;
31         uint32 count[2];//[0]=NoDedup, [1]=Exact
32     } typeFeatureCount;
33 
34     vector<vector<typeFeatureCount>> vCellFeatureCount(nCB);
35 
36     typedef struct {
37         uint32 feature;
38         uint64 umi;
39     } typeFeatureUMI;
40     vector<vector<typeFeatureUMI>::iterator> cbFeatUMI (nCB + 1);
41 
42     #pragma omp parallel for num_threads(P.runThreadN)
43     for (uint32 ired=0; ired<redistrFilesCBfirst.size()-1; ired++) {//TODO: parallelize, each ired is independent here!
44         vector<typeFeatureUMI> vFeatureUMI (redistrFilesNreads[ired]);
45 
46         uint32 iCB1=redistrFilesCBfirst[ired];
47         uint32 iCB2=redistrFilesCBfirst[ired+1];
48 
49         //allocate arrays
50         cbFeatUMI[iCB1]=vFeatureUMI.begin();
51         for (uint32 icb=iCB1+1; icb<iCB2; icb++) {
52             cbFeatUMI[icb] = cbFeatUMI[icb-1] + readFeatSum->cbReadCount[indCB[icb-1]];
53         };
54 
55         //input records
56         redistrFilesStreams[ired]->flush();
57         redistrFilesStreams[ired]->seekg(0,ios::beg);
58 
59         uint32 feature;
60         uint64 umi, iread;
61         int32 cbmatch;
62         int64 cb;
63         vector<uint32> trIdDist; //not used
64         bool readInfoYes=false;
65         while (soloInputFeatureUMI(redistrFilesStreams[ired], featureType, readInfoYes, P.sjAll, iread, cbmatch, feature, umi, trIdDist)) {//cycle over file records
66 
67             *redistrFilesStreams[ired] >> cb;
68 
69             if (feature+1 == 0)
70                 continue;
71 
72             uint32 icb=indCBwl[cb];
73             *( cbFeatUMI[icb] + nReadPerCB[icb] )={feature,umi};
74             nReadPerCB[icb]++;
75         };
76 
77         //collapse UMI, simple
78         for (uint32 icb=iCB1; icb<iCB2; icb++) {
79             if (nReadPerCB[icb]==0) {
80                 continue;
81             };
82 
83             sort(cbFeatUMI[icb], cbFeatUMI[icb]+nReadPerCB[icb],
84                     [](const typeFeatureUMI &a, const typeFeatureUMI &b)
85                       {return (a.feature < b.feature) || ( a.feature == b.feature && a.umi < b.umi); });
86 
87             vCellFeatureCount[icb].reserve(8192);
88             vCellFeatureCount[icb].push_back({cbFeatUMI[icb]->feature, {1,1}});//first read
89             for (auto fu=cbFeatUMI[icb]+1; fu!=cbFeatUMI[icb]+nReadPerCB[icb]; fu++) {//cycle over all reads for this icb
90                 if ( fu->feature != (fu-1)->feature ) {//compare to previous feature
91                     vCellFeatureCount[icb].push_back({fu->feature, {1,1}});//create next feature entry
92                 } else {//same feature
93                     vCellFeatureCount[icb].back().count[0]++; //non-collapsed UMI count
94 
95                     if ( fu->umi != (fu-1)->umi ) {//same feature, new umi
96                         vCellFeatureCount[icb].back().count[1]++;//collapsed UMI count
97                     };
98                 };
99             };
100         };
101     };
102 
103     time(&rawTime);
104     P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished reading / collapsing" <<endl;
105 
106     //convert to countCellGeneUMI. TODO - this is not necessary, recode output to use vCellFeatureCount
107     nUMIperCB.resize(nCB);
108     nGenePerCB.resize(nCB);
109 
110     countMatStride = pSolo.umiDedup.yes.N + 1;
111 
112     uint64 ccgN=0;//total number of entries in the countCellGeneUMI
113     for (auto &cbf :  vCellFeatureCount) {
114         ccgN+=cbf.size();
115     };
116     countCellGeneUMI.resize(ccgN*countMatStride);
117     countCellGeneUMIindex.resize(nCB+1);
118     countCellGeneUMIindex[0]=0;
119 
120     for (uint32 icb=0; icb<nCB; icb++) {//copy vCellFeatureCount
121 
122         nGenePerCB[icb]=vCellFeatureCount[icb].size();
123         countCellGeneUMIindex[icb+1] = countCellGeneUMIindex[icb] + nGenePerCB[icb]*countMatStride;
124 
125         for (uint64 ic=countCellGeneUMIindex[icb], ig=0; ic<countCellGeneUMIindex[icb+1]; ic+=countMatStride, ++ig) {//loop over genes
126 
127             countCellGeneUMI[ic + 0] = vCellFeatureCount[icb][ig].feature;
128 
129             if ( pSolo.umiDedup.yes.NoDedup )
130                 countCellGeneUMI[ic + pSolo.umiDedup.countInd.NoDedup] = vCellFeatureCount[icb][ig].count[0];
131             if ( pSolo.umiDedup.yes.Exact )
132                 countCellGeneUMI[ic + pSolo.umiDedup.countInd.Exact] = vCellFeatureCount[icb][ig].count[1];//collapsed UMI count
133 
134             nUMIperCB[icb] += countCellGeneUMI[ic + pSolo.umiDedup.countInd.main];
135         };
136 
137     };
138 
139     // sum stats
140     nReadPerCBtotal = nReadPerCB;
141     nReadPerCBunique = nReadPerCB;
142 
143     uint32 nReadPerCBmax=0;
144     for (uint32 icb=0; icb<nCB; icb++) {
145 
146         nReadPerCBmax=max(nReadPerCBmax,nReadPerCB[icb]);
147         readFeatSum->stats.V[readFeatSum->stats.nMatch] += nReadPerCBtotal[icb];
148         readFeatSum->stats.V[readFeatSum->stats.nMatchUnique] += nReadPerCBunique[icb];
149 
150         readFeatSum->stats.V[readFeatSum->stats.nUMIs] += nUMIperCB[icb];
151         if (nGenePerCB[icb]>0)
152             ++readFeatSum->stats.V[readFeatSum->stats.nCellBarcodes];
153     };
154 
155     readFeatSum->stats.V[readFeatSum->stats.nExactMatch]=readFeatSum->stats.V[readFeatSum->stats.nMatch];
156 
157     time(&rawTime);
158     P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished SmartSeq counting" <<endl;
159 };
160