1 #ifndef H_SoloFeature
2 #define H_SoloFeature
3 
4 #include <fstream>
5 #include <unordered_map>
6 #include <unordered_set>
7 
8 #include "IncludeDefine.h"
9 #include "ReadAlignChunk.h"
10 #include "Transcriptome.h"
11 
12 #include "SoloCommon.h"
13 #include "SoloRead.h"
14 #include "ReadAlignChunk.h"
15 
16 #include "SoloFilteredCells.h"
17 
//SoloFeature: state and processing entry points for one feature type (see featureType and featuresNumber below).
//This header only declares the class; every member function is defined elsewhere.
//Unqualified names (vector, string, array, unordered_map, unordered_set, fstream) are used below,
//so a using-declaration/directive has to come from one of the included project headers.
//NOTE(review): <array> is not included directly in this file - confirm it arrives via a project header.
18 class SoloFeature {
19 private:
20     Parameters &P; //reference member: has to be bound in the constructor initializer list; presumably to the constructor's Pin - TODO confirm
21     ReadAlignChunk **RAchunk; //pointer to ReadAlignChunk pointers; presumably the constructor argument of the same name - TODO confirm. Ownership is not visible here
22     Transcriptome &Trans; //reference member; presumably bound to the constructor's inTrans - TODO confirm
23 
24     const int32 featureType; //const: fixed at construction; presumably the constructor's feTy - TODO confirm
25     SoloFeature **soloFeatAll; //pointer to SoloFeature pointers; presumably the objects for all feature types - TODO confirm
26 
27     static const uint32 umiArrayStride=3; //NOTE(review): looks like the number of uint32 values per record in the umiArray passed to collapseUMI* - confirm
28     enum {rguG, rguU, rguR}; //unnamed enum with values 0,1,2; presumably offsets of the gene, UMI and read fields inside one rGeneUMI record - TODO confirm
29     uint32 rguStride; //no initializer here, set at run time; presumably the length of one rGeneUMI record - TODO confirm
30 
31 public:
32     ParametersSolo &pSolo; //reference member: has to be bound in the constructor initializer list
33 
34     SoloReadFeature *readFeatSum, **readFeatAll; //one SoloReadFeature pointer plus a pointer to SoloReadFeature pointers; presumably the summed and the per-thread objects (see sumThreads) - TODO confirm
35     SoloReadBarcode *readBarSum; //presumably the constructor's readBarSumIn - TODO confirm
36 
37     uint64 nReadsMapped, nReadsInput; //nReadsMapped: total number of mapped reads; nReadsInput: presumably the total number of input reads - TODO confirm
38     uint32 nCB; //presumably the number of detected CBs (cell barcodes) - TODO confirm
39     uint32 featuresNumber; //number of features (i.e. genes, SJs, etc)
40 
41     uint32 *rGeneUMI;//mapped reads sorted by CB. Raw pointer: allocation and release are not visible in this header
42     uint32 *rCBn;//number of reads for detected CBs in the whitelist
43     uint32 **rCBp;//array of pointers to each CB sub-array
44 
45     vector<uint32> indCB;//index of detected CBs in the whitelist
46     vector<uint32> indCBwl; //reverse of indCB: index of WL (whitelist) CBs in detected CB list
47     vector<uint32> nUMIperCB, nUMIperCBsorted;//number of UMIs per CB, and the same sorted in descending order
48     vector<uint32> nGenePerCB;//number of genes (with >0 UMIs) per CB
49     vector<uint32> nReadPerCB;//number of reads per CB. With multimappers: all aligns per CB
50     vector<uint32> nReadPerCBunique, nReadPerCBtotal; //number of unique and multiple reads per CB
51 
52     vector<uint32> countCellGeneUMI;//sparsified matrix for the counts, each entry is: geneID count1 count2 ... countNcounts
53     vector<uint32> countCellGeneUMIindex;//index of CBs in the count matrix
54     uint32 countMatStride; //number of counts per entry in the count matrix
55 
    //NOTE(review): the fields below look like a second count matrix with double-valued entries (m),
    //an index (i) and a stride (s), mirroring countCellGeneUMI / countCellGeneUMIindex / countMatStride - confirm
56     struct {
57         vector<double> m;
58         vector<uint32> i;
59         uint32 s;
60     } countMatMult;
61 
62     vector<unordered_map<uint32, unordered_set<uint64>>> cbFeatureUMImap; //for SmartSeq counting. Each vector element maps a uint32 key to a set of uint64; presumably CB -> feature -> UMIs, going by the name - TODO confirm
63 
64     string outputPrefix, outputPrefixFiltered; //presumably output path prefixes for the raw and the filtered results - TODO confirm
65 
66     SoloFilteredCells filteredCells; //presumably filled by cellFiltering() - TODO confirm
67 
68     array<vector<uint64>,2> sjAll; //two uint64 vectors; NOTE(review): the name suggests splice junction data - confirm what each of the two vectors holds
69 
70     vector<readInfoStruct> readInfo; //corrected CB/UMI information for each read
71 
72     vector<uint32> redistrFilesCBindex, redistrFilesCBfirst; //redistr file for each CB, CB boundaries in redistributed files
73     vector<uint64> redistrFilesNreads; //number of reads in each file
74     vector <fstream*> redistrFilesStreams; //raw fstream pointers, presumably one per redistributed file - TODO confirm; who opens/closes/deletes them is not visible here
75 
    //The constructor takes the parameters, the ReadAlignChunk pointer array, the transcriptome, the feature type,
    //a SoloReadBarcode pointer and the array of SoloFeature pointers. Which member each argument initializes
    //is decided in the definition, which is not part of this header.
76     SoloFeature(Parameters &Pin, ReadAlignChunk **RAchunk, Transcriptome &inTrans, int32 feTy, SoloReadBarcode *readBarSumIn, SoloFeature **soloFeatAll);
    //The notes on the member functions below are inferred from names and signatures only - confirm against the implementation files.
77     void processRecords(); //presumably the top-level entry point for this feature - TODO confirm
78     void sumThreads(); //presumably combines per-thread data (readFeatAll) into the summed object (readFeatSum) - TODO confirm
79     void countSmartSeq(); //presumably the counting path that uses cbFeatureUMImap - TODO confirm
80     void countCBgeneUMI();
81     void countVelocyto();
82     void quantTranscript();
83 
    //UMI collapsing for one CB. iCB: presumably an index into the detected CB list; umiArray: caller-supplied uint32 buffer - TODO confirm
84     void collapseUMI(uint32 iCB, uint32 *umiArray);
85     void collapseUMI_CR(uint32 iCB, uint32 *umiArray);
86     void collapseUMIall(uint32 iCB, uint32 *umiArray);
    //Three UMI correction variants with parallel signatures; each returns a uint32 (meaning not visible here).
    //umiArr is a non-const pointer and umiCorr a non-const reference, so both can be modified by the callee.
    //Only the Directional variant takes the extra dirCountAdd argument.
87     uint32 umiArrayCorrect_CR         (const uint32 nU0, uintUMI *umiArr, const bool readInfoRec, const bool nUMIyes, unordered_map <uintUMI,uintUMI> &umiCorr);
88     uint32 umiArrayCorrect_Directional(const uint32 nU0, uintUMI *umiArr, const bool readInfoRec, const bool nUMIyes, unordered_map <uintUMI,uintUMI> &umiCorr, const int32 dirCountAdd);
89     uint32 umiArrayCorrect_Graph      (const uint32 nU0, uintUMI *umiArr, const bool readInfoRec, const bool nUMIyes, unordered_map <uintUMI,uintUMI> &umiCorr);
90 
91     void outputResults(bool cellFilterYes, string outputPrefixMat); //outputPrefixMat is passed by value (copied on each call)
92     void addBAMtags(char *&bam0, uint32 &size0, char* bam1); //bam0 and size0 are passed by non-const reference, so the callee can change the caller's pointer and size
93     void statsOutput();
94     void redistributeReadsByCB(); //presumably fills the redistrFiles* members above - TODO confirm
95 
96     void cellFiltering();
97     void emptyDrops_CR();
98     void loadRawMatrix();
99 };
100 
101 #endif
102