1 #include "Transcriptome.h"
2 #include "streamFuns.h"
3 #include "GlobalVariables.h"
4 #include "ErrorWarning.h"
5 #include "serviceFuns.cpp"
6 
Transcriptome(Parameters & Pin)7 Transcriptome::Transcriptome (Parameters &Pin) : P(Pin){
8 
9     if (!P.quant.yes)
10         return;
11 
12     trInfoDir = P.pGe.sjdbGTFfile=="-" ? P.pGe.gDir : P.sjdbInsert.outDir; //if GTF file is given at the mapping stage, it's always used for transcript info
13 
14     ifstream &geStream = ifstrOpen(trInfoDir+"/geneInfo.tab", ERROR_OUT, "SOLUTION: utilize --sjdbGTFfile /path/to/annotations.gtf option at the genome generation step or mapping step", P);
15     geStream >> nGe;
16     geID.resize(nGe);
17     geName.resize(nGe);
18     geBiotype.resize(nGe);
19     geStream.ignore(999,'\n');
20     for (uint ii=0;ii<nGe;ii++) {
21         string line1;
22         getline(geStream,line1);
23         istringstream stream1(line1);
24         stream1 >> geID[ii] >> geName[ii] >> geBiotype[ii];
25     };
26     geStream.close();
27 
28     if ( P.quant.trSAM.yes || P.quant.gene.yes ) {//load exon-transcript structures
29         //load tr and ex info
30         ifstream & trinfo = ifstrOpen(trInfoDir+"/transcriptInfo.tab", ERROR_OUT, "SOLUTION: utilize --sjdbGTFfile /path/to/annotantions.gtf option at the genome generation step or mapping step",P);
31         trinfo >> nTr;
32         trS=new uint [nTr];
33         trE=new uint [nTr];
34         trEmax=new uint [nTr];
35         trExI=new uint32 [nTr];
36         trExN=new uint16 [nTr];
37         trStr=new uint8 [nTr];
38         trID.resize(nTr);
39         trGene=new uint32 [nTr];
40         trLen=new uint32 [nTr];
41 
42         for (uint32 itr=0; itr<nTr; itr++) {
43             uint16 str1;
44             trinfo >> trID[itr] >> trS[itr] >> trE[itr] >> trEmax[itr] >> str1 >> trExN[itr] >> trExI[itr] >> trGene[itr];
45             trStr[itr]=str1;
46 
47             if (!trinfo.good()) {
48                 ostringstream errOut;
49                 errOut <<"EXITING because of FATAL GENOME INDEX FILE error: transcriptInfo.tab is corrupt, or is incompatible with the current STAR version\n";
50                 errOut <<"SOLUTION: re-generate genome index";
51                 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P);
52             };
53 
54         };
55         P.inOut->logMain << "Loaded transcript database, nTr="<<nTr<<endl;
56         trinfo.close();
57 
58         ifstream & exinfo = ifstrOpen(trInfoDir+"/exonInfo.tab", ERROR_OUT, "SOLUTION: utilize --sjdbGTFfile /path/to/annotantions.gtf option at the genome generation step or mapping step", P);
59         exinfo >> nEx;
60         exSE = new uint32 [2*nEx];
61         exLenCum = new uint32 [nEx];
62         for (uint32 iex=0; iex<nEx; iex++) {
63             exinfo >> exSE[2*iex] >> exSE[2*iex+1] >> exLenCum[iex]; //reading all elements one after another
64         };
65         for (uint32 ii=0;ii<nTr;ii++) {
66             uint32 iex1=trExI[ii]+trExN[ii]-1; //last exon of the transcript
67             trLen[ii]=exLenCum[iex1]+exSE[2*iex1+1]-exSE[2*iex1]+1;
68         };
69 
70         P.inOut->logMain << "Loaded exon database, nEx="<<nEx<<endl;
71         exinfo.close();
72     };
73     //load exon-gene structures
74     if ( P.quant.geCount.yes ) {
75         ifstream & exinfo = ifstrOpen(trInfoDir+"/exonGeTrInfo.tab", ERROR_OUT, "SOLUTION: utilize --sjdbGTFfile /path/to/annotantions.gtf option at the genome generation step or mapping step", P);
76         exinfo >> exG.nEx;
77         exG.s=new uint64[exG.nEx];
78         exG.e=new uint64[exG.nEx];
79         exG.eMax=new uint64[exG.nEx];
80         exG.str=new uint8[exG.nEx];
81         exG.g=new uint32[exG.nEx];
82         exG.t=new uint32[exG.nEx];
83         for (uint ii=0;ii<exG.nEx;ii++) {
84             int str1;
85             exinfo >> exG.s[ii] >> exG.e[ii] >> str1 >> exG.g[ii] >> exG.t[ii];
86             exG.str[ii] = (uint8) str1;
87         };
88         exinfo.close();
89         //calculate eMax
90         exG.eMax[0]=exG.e[0];
91         for (uint iex=1;iex<exG.nEx;iex++) {
92             exG.eMax[iex]=max(exG.eMax[iex-1],exG.e[iex]);
93         };
94     };
95 
96     if ( P.quant.geneFull.yes ) {
97         ifstream & exinfo = ifstrOpen(trInfoDir+"/exonGeTrInfo.tab", ERROR_OUT, "SOLUTION: utilize --sjdbGTFfile /path/to/annotantions.gtf option at the genome generation step or mapping step", P);
98         exinfo >> exG.nEx;
99 
100         geneFull.s=new uint64[nGe];
101         geneFull.e=new uint64[nGe];
102         geneFull.eMax=new uint64[nGe];
103         geneFull.g=new uint32[nGe];
104         geneFull.str=new uint8[nGe];
105 
106         for (uint ig=0;ig<nGe;ig++) {
107             geneFull.s[ig]=-1;//largest uint64
108             geneFull.e[ig]=0;
109         };
110 
111         for (uint ii=0;ii<exG.nEx;ii++) {
112             uint64 s1,e1,str1,g1,t1;
113             exinfo >> s1 >> e1 >> str1 >> g1 >> t1;
114             geneFull.s[g1]=min(geneFull.s[g1],s1);
115             geneFull.e[g1]=max(geneFull.e[g1],e1);
116             geneFull.str[g1] = (uint8) str1;
117         };
118         exinfo.close();
119 
120         uint64 *gF=new uint64 [4*nGe];
121         for (uint ii=0;ii<nGe;ii++) {
122             gF[4*ii]   = geneFull.s[ii];
123             gF[4*ii+1] = geneFull.e[ii];
124             gF[4*ii+2] = geneFull.str[ii];
125             gF[4*ii+3] = ii;
126         };
127 
128         qsort((void*) gF, nGe, sizeof(uint64)*4, funCompareArrays<uint64,2>);
129 
130         for (uint ii=0;ii<nGe;ii++) {
131             geneFull.s[ii]   = gF[4*ii];
132             geneFull.e[ii]   = gF[4*ii+1];
133             geneFull.str[ii] = gF[4*ii+2];
134             geneFull.g[ii]   = gF[4*ii+3];
135         };
136 
137         //calculate eMax
138         geneFull.eMax[0]=geneFull.e[0];
139         for (uint iex=1;iex<nGe;iex++) {
140             geneFull.eMax[iex]=max(geneFull.eMax[iex-1],geneFull.e[iex]);
141         };
142     };
143 
144 };
145 
quantsAllocate()146 void Transcriptome::quantsAllocate() {
147     if ( P.quant.geCount.yes ) {
148         quants = new Quantifications (nGe);
149     };
150 };
151 
quantsOutput()152 void Transcriptome::quantsOutput() {
153     ofstream qOut(P.quant.geCount.outFile);
154     qOut << "N_unmapped";
155     for (int itype=0; itype<quants->geneCounts.nType; itype++) {
156         qOut << "\t" <<g_statsAll.unmappedAll;
157     };
158     qOut << "\n";
159 
160     qOut << "N_multimapping";
161     for (int itype=0; itype<quants->geneCounts.nType; itype++){
162         qOut << "\t" <<quants->geneCounts.cMulti;
163     };
164     qOut << "\n";
165 
166     qOut << "N_noFeature";
167     for (int itype=0; itype<quants->geneCounts.nType; itype++){
168         qOut << "\t" <<quants->geneCounts.cNone[itype];
169     };
170     qOut << "\n";
171 
172     qOut << "N_ambiguous";
173     for (int itype=0; itype<quants->geneCounts.nType; itype++) {
174         qOut << "\t" <<quants->geneCounts.cAmbig[itype];
175     };
176     qOut << "\n";
177 
178     for (uint32 ig=0; ig<nGe; ig++) {
179         qOut << geID[ig];
180         for (int itype=0; itype<quants->geneCounts.nType; itype++) {
181             qOut << "\t" <<quants->geneCounts.gCount[itype][ig];
182         };
183         qOut << "\n";
184     };
185     qOut.close();
186 };
187