1 #include "Genome.h"
2 #include "SuffixArrayFuns.h"
3 #include "PackedArray.h"
4 #include "ErrorWarning.h"
5 #include "streamFuns.h"
6 #include "SharedMemory.h"
7 #include "genomeScanFastaFiles.h"
8
9 //addresses with respect to shmStart of several genome values
10 #define SHM_sizeG 0
11 #define SHM_sizeSA 8
12 #define SHM_startG 16
13 // #define SHM_startSA 24
14 //
15 // //first available byt of the shm
16 // #define SHM_startSHM 32
17
genomeLoad()18 void Genome::genomeLoad(){//allocate and load Genome
19
20 time_t rawtime;
21 time ( &rawtime );
22 *(P.inOut->logStdOut) << timeMonthDayTime(rawtime) << " ..... loading genome\n" <<flush;
23
24 uint *shmNG=NULL, *shmNSA=NULL; //pointers to shm stored values , *shmSG, *shmSSA
25 uint64 shmSize=0;//, shmStartG=0; shmStartSA=0;
26
27 uint L=200,K=6;
28
29 Parameters P1;
30
31 //some initializations before reading the parameters
32 GstrandBit=0;
33
34 ifstream parFile((pGe.gDir+("/genomeParameters.txt")).c_str());
35 if (parFile.good()) {
36 P.inOut->logMain << "Reading genome generation parameters:\n";
37
38 //read genome internal parameters
39 while (parFile.good()) {
40 string word1;
41 parFile >> word1;
42 if (word1=="###") {
43 parFile >> word1;
44 if (word1=="GstrandBit") {
45 uint gsb1=0;
46 parFile >> gsb1;
47 GstrandBit=(uint8) gsb1;
48 P.inOut->logMain << "### GstrandBit=" << (uint) GstrandBit <<"\n";
49 } else {
50 P.inOut->logMain << "### " <<word1;
51 getline(parFile,word1);
52 P.inOut->logMain <<word1<<"\n";
53 };
54 };
55 };
56 parFile.clear();
57 parFile.seekg(0,ios::beg);//rewind
58
59
60 P1.inOut = P.inOut;
61 P1.scanAllLines(parFile,3,-1);
62 parFile.close();
63 } else {
64 ostringstream errOut;
65 errOut << "EXITING because of FATAL ERROR: could not open genome file "<< pGe.gDir+("/genomeParameters.txt") << endl;
66 errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush;
67 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P);
68 };
69
70 //check genome version
71 if (P1.versionGenome.size()==0) {//
72 ostringstream errOut;
73 errOut << "EXITING because of FATAL ERROR: read no value for the versionGenome parameter from genomeParameters.txt file\n";
74 errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n";
75 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P);
76 } else if (P1.versionGenome == P.versionGenome) {//
77 P.inOut->logMain << "Genome version is compatible with current STAR\n";
78 } else {
79 ostringstream errOut;
80 errOut << "EXITING because of FATAL ERROR: Genome version: " << P1.versionGenome << " is INCOMPATIBLE with running STAR version: "<< STAR_VERSION <<"\n";
81 errOut << "SOLUTION: please re-generate genome from scratch with running version of STAR, or with version: " << P.versionGenome <<"\n";
82 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P);
83 };
84
85 //find chr starts from files
86 chrInfoLoad();
87
88 //check if sjdbInfo.txt exists => genome was generated with junctions
89 bool sjdbInfoExists=false;
90 struct stat sjdb1;
91 if ( stat( (pGe.gDir+"/sjdbInfo.txt").c_str(), &sjdb1) == 0 ) {//file exists
92 sjdbInfoExists=true;
93 };
94
95 if ( P.sjdbInsert.yes && sjdbInfoExists && P1.pGe.sjdbInsertSave=="") {
96 //if sjdbInsert, and genome had junctions, and genome is old - it should be re-generated with new STAR
97 ostringstream errOut;
98 errOut << "EXITING because of FATAL ERROR: old Genome is INCOMPATIBLE with on the fly junction insertion\n";
99 errOut << "SOLUTION: please re-generate genome from scratch with the latest version of STAR\n";
100 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P);
101 };
102
103 //record required genome parameters in P
104 pGe.gSAindexNbases=P1.pGe.gSAindexNbases;
105 pGe.gChrBinNbits=P1.pGe.gChrBinNbits;
106 genomeChrBinNbases=1LLU<<pGe.gChrBinNbits;
107 pGe.gSAsparseD=P1.pGe.gSAsparseD;
108
109 if (P1.pGe.gFileSizes.size()>0){//genomeFileSize was recorded in the genomeParameters file, copy the values to P
110 pGe.gFileSizes = P1.pGe.gFileSizes;
111 };
112
113 if (P.parArray.at(pGe.sjdbOverhang_par)->inputLevel==0 && P1.pGe.sjdbOverhang>0) {
114 //if --sjdbOverhang was not defined by user and it was defined >0 at the genome generation step, then use pGe.sjdbOverhang from the genome generation step
115 pGe.sjdbOverhang=P1.pGe.sjdbOverhang;
116 P.inOut->logMain << "--sjdbOverhang = " << pGe.sjdbOverhang << " taken from the generated genome\n";
117 } else if (sjdbInfoExists && P.parArray.at(pGe.sjdbOverhang_par)->inputLevel>0 && pGe.sjdbOverhang!=P1.pGe.sjdbOverhang) {
118 //if pGe.sjdbOverhang was defined at the genome generation step,the mapping step value has to agree with it
119 ostringstream errOut;
120 errOut << "EXITING because of fatal PARAMETERS error: present --sjdbOverhang="<<pGe.sjdbOverhang << " is not equal to the value at the genome generation step ="<< P1.pGe.sjdbOverhang << "\n";
121 errOut << "SOLUTION: \n" <<flush;
122 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P);
123 };
124 sjdbOverhang = pGe.sjdbOverhang;
125 sjdbLength = pGe.sjdbOverhang==0 ? 0 : pGe.sjdbOverhang*2+1;
126
127 pGe.gType=1;
128 if (P1.pGe.gTypeString=="SuperTranscriptome")
129 pGe.gType=101;
130
131 P.inOut->logMain << "Started loading the genome: " << asctime (localtime ( &rawtime ))<<"\n"<<flush;
132
133 ifstream GenomeIn, SAin, SAiIn;
134
135 if (pGe.gFileSizes.size() < 2) {//no size info available
136 pGe.gFileSizes.push_back(0);
137 pGe.gFileSizes.push_back(0);
138 };
139 nGenome = OpenStream("Genome",GenomeIn,pGe.gFileSizes.at(0));
140 nSAbyte = OpenStream("SA",SAin,pGe.gFileSizes.at(1));
141 OpenStream("SAindex",SAiIn,1); //we do not need SAiIn siz, using a dummy value here to prevent from reading its size from the disk
142
143 uint SAiInBytes=0;
144 SAiInBytes += fstreamReadBig(SAiIn,(char*) &pGe.gSAindexNbases, sizeof(pGe.gSAindexNbases));
145 genomeSAindexStart = new uint[pGe.gSAindexNbases+1];
146 SAiInBytes += fstreamReadBig(SAiIn,(char*) genomeSAindexStart, sizeof(genomeSAindexStart[0])*(pGe.gSAindexNbases+1));
147 nSAi=genomeSAindexStart[pGe.gSAindexNbases];
148 P.inOut->logMain << "Read from SAindex: pGe.gSAindexNbases=" << pGe.gSAindexNbases <<" nSAi="<< nSAi <<endl;
149
150 /////////////////////////////////// at this point all array sizes should be known: calculate packed array lengths
151 if (GstrandBit==0) {//not defined before
152 GstrandBit = (uint) floor(log(nGenome)/log(2))+1;
153 if (GstrandBit<32)
154 GstrandBit=32; //TODO: use simple access function for SA
155 };
156
157 GstrandMask = ~(1LLU<<GstrandBit);
158 nSA=(nSAbyte*8)/(GstrandBit+1);
159 SA.defineBits(GstrandBit+1,nSA);
160
161 SAiMarkNbit=GstrandBit+1;
162 SAiMarkAbsentBit=GstrandBit+2;
163
164 SAiMarkNmaskC=1LLU << SAiMarkNbit;
165 SAiMarkNmask=~SAiMarkNmaskC;
166 SAiMarkAbsentMaskC=1LLU << SAiMarkAbsentBit;
167 SAiMarkAbsentMask=~SAiMarkAbsentMaskC;
168
169 SAi.defineBits(GstrandBit+3,nSAi);
170
171 P.inOut->logMain << "nGenome=" << nGenome << "; nSAbyte=" << nSAbyte <<endl<< flush;
172 P.inOut->logMain <<"GstrandBit="<<int(GstrandBit)<<" SA number of indices="<<nSA<<endl<<flush;
173
174 shmSize=SA.lengthByte + nGenome+L+L+SHM_startG+8;
175 shmSize+= SAi.lengthByte;
176
177 if ((pGe.gLoad=="LoadAndKeep" ||
178 pGe.gLoad=="LoadAndRemove" ||
179 pGe.gLoad=="LoadAndExit" ||
180 pGe.gLoad=="Remove") && sharedMemory == NULL) {
181
182 bool unloadLast = pGe.gLoad=="LoadAndRemove";
183 try {
184 sharedMemory = new SharedMemory(shmKey, unloadLast);
185 sharedMemory->SetErrorStream(P.inOut->logStdOut);
186
187 if (!sharedMemory->NeedsAllocation())
188 P.inOut->logMain <<"Found genome in shared memory\n"<<flush;
189
190 if (pGe.gLoad=="Remove") {//kill the genome and exit
191 if (sharedMemory->NeedsAllocation()) {//did not find genome in shared memory, nothing to kill
192 ostringstream errOut;
193 errOut << "EXITING: Did not find the genome in memory, did not remove any genomes from shared memory\n";
194 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_FILES, P);
195 } else {
196 sharedMemory->Clean();
197 P.inOut->logMain <<"DONE: removed the genome from shared memory\n"<<flush;
198 return;
199 };
200 };
201
202 if (sharedMemory->NeedsAllocation()){
203 P.inOut->logMain <<"Allocating shared memory for genome\n"<<flush;
204 sharedMemory->Allocate(shmSize);
205 };
206 } catch (const SharedMemoryException & exc){
207 HandleSharedMemoryException(exc, shmSize);
208 };
209
210 shmStart = (char*) sharedMemory->GetMapped();
211 shmNG= (uint*) (shmStart+SHM_sizeG);
212 shmNSA= (uint*) (shmStart+SHM_sizeSA);
213
214 if (!sharedMemory->IsAllocator()) {
215 // genome is in shared memory or being loaded
216 // wait for the process that will populate it
217 // and record the sizes
218 uint iwait=0;
219 while (*shmNG != nGenome) {
220 iwait++;
221 P.inOut->logMain <<"Another job is still loading the genome, sleeping for 1 min\n" <<flush;
222 sleep(60);
223 if (iwait==100) {
224 ostringstream errOut;
225 errOut << "EXITING because of FATAL ERROR: waited too long for the other job to finish loading the genome" << strerror(errno) << "\n" <<flush;
226 errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" <<flush;
227 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_GENOME_LOADING_WAITED_TOO_LONG, P);
228 };
229 };
230 if (nSAbyte!=*shmNSA){
231 ostringstream errOut;
232 errOut << "EXITING because of FATAL ERROR: the SA file size did not match what we found in shared memory" << "\n" << flush;
233 errOut << "SOLUTION: remove the shared memory chunk by running STAR with --genomeLoad Remove, and restart STAR" << flush;
234 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INCONSISTENT_DATA, P);
235 };
236 P.inOut->logMain << "Using shared memory for genome. key=0x" <<hex<<shmKey<<dec<< "; shmid="<< sharedMemory->GetId() <<endl<<flush;
237 };
238 G1=shmStart+SHM_startG;
239 SA.pointArray(G1+nGenome+L+L);
240 char* shmNext=SA.charArray+nSAbyte;
241
242 SAi.pointArray(shmNext);
243 shmNext += SAi.lengthByte;
244 } else if (pGe.gLoad=="NoSharedMemory") {// simply allocate memory, do not use shared memory
245 genomeInsertL=0;
246 genomeInsertChrIndFirst=nChrReal;
247 if (pGe.gFastaFiles.at(0)!="-") {//will insert sequences in the genome, now estimate the extra size
248 uint oldlen=chrStart.back();//record the old length
249 genomeInsertL=genomeScanFastaFiles(P, G, false, *this)-oldlen;
250 };
251
252 try {
253 if (P.sjdbInsert.pass1 || P.sjdbInsert.pass2) {
254 //reserve extra memory for insertion at the 1st and/or 2nd step
255 nGenomeInsert=nGenome+genomeInsertL;
256 nSAinsert=nSA+2*genomeInsertL;
257
258 nGenomePass1=nGenomeInsert;
259 nSApass1=nSAinsert;
260 if (P.sjdbInsert.pass1) {
261 nGenomePass1+=P.limitSjdbInsertNsj*sjdbLength;
262 nSApass1+=2*P.limitSjdbInsertNsj*sjdbLength;
263 };
264
265 nGenomePass2=nGenomePass1;
266 nSApass2=nSApass1;
267 if (P.sjdbInsert.pass2) {
268 nGenomePass2+=P.limitSjdbInsertNsj*sjdbLength;
269 nSApass2+=2*P.limitSjdbInsertNsj*sjdbLength;
270 };
271
272 G1=new char[nGenomePass2+L+L];
273
274 SApass2.defineBits(GstrandBit+1,nSApass2);
275 SApass2.allocateArray();
276
277 SApass1.defineBits(GstrandBit+1,nSApass1);
278 SApass1.pointArray(SApass2.charArray+SApass2.lengthByte-SApass1.lengthByte);
279
280 SAinsert.defineBits(GstrandBit+1,nSAinsert);
281 SAinsert.pointArray(SApass1.charArray+SApass1.lengthByte-SAinsert.lengthByte);
282
283 SA.pointArray(SAinsert.charArray+SAinsert.lengthByte-SA.lengthByte);
284 } else {//no sjdb insertions
285 if (genomeInsertL==0) {// no sequence insertion, simple allocation
286 G1=new char[nGenome+L+L];
287 SA.allocateArray();
288 } else {
289 G1=new char[nGenome+L+L+genomeInsertL];
290 SAinsert.defineBits(GstrandBit+1,nSA+2*genomeInsertL);//TODO: re-define GstrandBit if necessary
291 SAinsert.allocateArray();
292 SA.pointArray(SAinsert.charArray+SAinsert.lengthByte-SA.lengthByte);
293 };
294 };
295 SAi.allocateArray();
296 P.inOut->logMain <<"Shared memory is not used for genomes. Allocated a private copy of the genome.\n"<<flush;
297 } catch (exception & exc) {
298 ostringstream errOut;
299 errOut <<"EXITING: fatal error trying to allocate genome arrays, exception thrown: "<<exc.what()<<endl;
300 errOut <<"Possible cause 1: not enough RAM. Check if you have enough RAM " << nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000 << " bytes\n";
301 errOut <<"Possible cause 2: not enough virtual memory allowed with ulimit. SOLUTION: run ulimit -v " << nGenome+L+L+SA.lengthByte+SAi.lengthByte+2000000000<<endl <<flush;
302 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_MEMORY_ALLOCATION, P);
303 };
304 };
305
306 G=G1+L;
307
308 bool isAllocatorProcess = sharedMemory != NULL && sharedMemory->IsAllocator();
309
310 if (pGe.gLoad=="NoSharedMemory" || isAllocatorProcess) {//load genome and SAs from files
311 //load genome
312 P.inOut->logMain <<"Genome file size: "<<nGenome <<" bytes; state: good=" <<GenomeIn.good()\
313 <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"\n"<<flush;
314 P.inOut->logMain <<"Loading Genome ... " << flush;
315 uint64 genomeReadBytesN=fstreamReadBig(GenomeIn,G,nGenome);
316 P.inOut->logMain <<"done! state: good=" <<GenomeIn.good()\
317 <<" eof="<<GenomeIn.eof()<<" fail="<<GenomeIn.fail()<<" bad="<<GenomeIn.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush;
318 GenomeIn.close();
319
320 for (uint ii=0;ii<L;ii++) {// attach a tail with the largest symbol
321 G1[ii]=K-1;
322 G[nGenome+ii]=K-1;
323 };
324
325 //load SAs
326 P.inOut->logMain <<"SA file size: "<<SA.lengthByte <<" bytes; state: good=" <<SAin.good()\
327 <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"\n"<<flush;
328 P.inOut->logMain <<"Loading SA ... " << flush;
329 genomeReadBytesN=fstreamReadBig(SAin,SA.charArray, SA.lengthByte);
330 P.inOut->logMain <<"done! state: good=" <<SAin.good()\
331 <<" eof="<<SAin.eof()<<" fail="<<SAin.fail()<<" bad="<<SAin.bad()<<"; loaded "<<genomeReadBytesN<<" bytes\n" << flush;
332 SAin.close();
333
334 P.inOut->logMain <<"Loading SAindex ... " << flush;
335 SAiInBytes +=fstreamReadBig(SAiIn,SAi.charArray, SAi.lengthByte);
336 P.inOut->logMain <<"done: "<<SAiInBytes<<" bytes\n" << flush;
337 };
338
339 SAiIn.close();
340
341 if ((pGe.gLoad=="LoadAndKeep" ||
342 pGe.gLoad=="LoadAndRemove" ||
343 pGe.gLoad=="LoadAndExit") && isAllocatorProcess )
344 {
345 //record sizes. This marks the end of genome loading
346 *shmNG=nGenome;
347 *shmNSA=nSAbyte;
348 };
349
350 time ( &rawtime );
351 P.inOut->logMain << "Finished loading the genome: " << asctime (localtime ( &rawtime )) <<"\n"<<flush;
352
353 #ifdef COMPILE_FOR_MAC
354 {
355 uint sum1=0;
356 for (uint ii=0;ii<nGenome; ii++) sum1 += (uint) (unsigned char) G[ii];
357 P.inOut->logMain << "Sum of all Genome bytes: " <<sum1 <<"\n"<<flush;
358 sum1=0;
359 for (uint ii=0;ii<SA.lengthByte; ii++) sum1 += (uint) (unsigned char) SA.charArray[ii];
360 P.inOut->logMain << "Sum of all SA bytes: " <<sum1 <<"\n"<<flush;
361 sum1=0;
362 for (uint ii=0;ii<SAi.lengthByte; ii++) sum1 += (uint) (unsigned char) SAi.charArray[ii];
363 P.inOut->logMain << "Sum of all SAi bytes: " <<sum1 <<"\n"<<flush;
364 };
365 #endif
366
367 if (pGe.gLoad=="LoadAndExit") {
368 uint shmSum=0;
369 for (uint ii=0;ii<shmSize;ii++) shmSum+=shmStart[ii];
370 P.inOut->logMain << "pGe.gLoad=LoadAndExit: completed, the genome is loaded and kept in RAM, EXITING now.\n"<<flush;
371 return;
372 };
373
374 Genome::insertSequences();
375
376 Genome::chrBinFill();
377
378 Genome::loadSJDB(pGe.gDir);
379
380 //check and redefine some parameters
381 //max intron size
382 if (P.alignIntronMax==0 && P.alignMatesGapMax==0) {
383 P.inOut->logMain << "alignIntronMax=alignMatesGapMax=0, the max intron size will be approximately determined by (2^winBinNbits)*winAnchorDistNbins=" \
384 << (1LLU<<P.winBinNbits)*P.winAnchorDistNbins <<endl;
385 } else {
386 //redefine winBinNbits
387 P.winBinNbits = (uint) floor( log2( max( max(4LLU,P.alignIntronMax), (P.alignMatesGapMax==0 ? 1000LLU : P.alignMatesGapMax) ) /4 ) + 0.5);
388 P.winBinNbits = max( P.winBinNbits, (uint) floor(log2(nGenome/40000+1)+0.5) );
389 //ISSUE - to be fixed in STAR3: if alignIntronMax>0 but alignMatesGapMax==0, winBinNbits will be defined by alignIntronMax
390 P.inOut->logMain << "To accommodate alignIntronMax="<<P.alignIntronMax<<" redefined winBinNbits="<< P.winBinNbits <<endl;
391 };
392
393 if (P.winBinNbits > pGe.gChrBinNbits) {
394 P.inOut->logMain << "winBinNbits=" <<P.winBinNbits <<" > " << "pGe.gChrBinNbits=" << pGe.gChrBinNbits << " redefining:\n";
395 P.winBinNbits=pGe.gChrBinNbits;
396 P.inOut->logMain << "winBinNbits=" <<P.winBinNbits <<endl;
397 };
398
399
400 if (P.alignIntronMax==0 && P.alignMatesGapMax==0) {
401 } else {
402 //redefine winFlankNbins,winAnchorDistNbins
403 P.winFlankNbins=max(P.alignIntronMax,P.alignMatesGapMax)/(1LLU<<P.winBinNbits)+1;
404 P.winAnchorDistNbins=2*P.winFlankNbins;
405 P.inOut->logMain << "To accommodate alignIntronMax="<<P.alignIntronMax<<" and alignMatesGapMax="<<P.alignMatesGapMax<<\
406 ", redefined winFlankNbins="<<P.winFlankNbins<<" and winAnchorDistNbins="<<P.winAnchorDistNbins<<endl;
407 };
408
409 P.winBinChrNbits=pGe.gChrBinNbits-P.winBinNbits;
410 P.winBinN = nGenome/(1LLU << P.winBinNbits)+1;//this may be changed later
411
412 if (pGe.gType==101) {//SuperTranscriptome
413 superTr = new SuperTranscriptome (P);
414 superTr->load(G, chrStart, chrLength);
415
416 //genomeOut
417 P.pGeOut.gDir=pGe.gDir+"/fullGenome/";
418 genomeOut.convYes=true;
419 genomeOut.gapsAreJunctions=true;
420 genomeOut.convFile=pGe.gDir+"/fullGenome/conversionToFullGenome.tsv";
421
422 genomeOut.g = new Genome(P,P.pGeOut);
423 genomeOut.g->genomeOut=genomeOut;
424 genomeOut.g->genomeOutLoad();
425
426 //debug checks
427 // for (auto &b : genomeOut.g->genomeOut.convBlocks) {
428 // uint64 g1=*(genomeOut.g->G+b[2]);
429 // uint64 g2=*(G+b[0]);
430 // if (g1!=g2)
431 // cout <<b[0]<<" "<<b[1]<<" "<<b[2]<<endl;
432 // };
433 };
434
435
436 if (P1.pGe.transform.typeString=="Haploid") {
437 pGe.transform.type=1;
438 } else if (P1.pGe.transform.typeString=="Diploid") {
439 pGe.transform.type=2;
440 } else {//TODO check for wrong values
441 pGe.transform.type=0;
442 };
443
444 if (pGe.transform.outYes) {
445 if (pGe.transform.type == 0) {
446 exitWithError("EXITING because of FATAL INPUT ERROR: outTransformOutput is set, but the genome " +pGe.gDir + " was generated without transformation\n"
447 + "SOLUTION: use the default--outTransformOutput None, or re-generate the genome with transformation options.\n"
448 ,std::cerr, P.inOut->logMain, EXIT_CODE_MEMORY_ALLOCATION, P);
449
450 } else {//transform genome coordinates
451 //genomeOut
452 P.pGeOut.gDir=pGe.gDir+"/OriginalGenome/";
453 genomeOut.convYes=true;
454 genomeOut.gapsAreJunctions=false;
455 genomeOut.convFile=pGe.gDir+"/transformGenomeBlocks.tsv";
456
457
458 genomeOut.g = new Genome(P,P.pGeOut);
459 genomeOut.g->genomeOut=genomeOut;
460 genomeOut.g->genomeOutLoad();
461 };
462 };
463
464 if (P.pGe.gLoad=="LoadAndExit" || P.pGe.gLoad=="Remove") {
465 exit(0);
466 };
467 };
468
469 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
470 //
loadSJDB(string & genDir)471 void Genome::loadSJDB(string &genDir)
472 {
473 //splice junctions database
474 if (nGenome==chrStart[nChrReal]) {//no sjdb
475 sjdbN=0;
476 sjGstart=chrStart[nChrReal]+1; //not sure why I need that
477 } else {//there are sjdb chromosomes
478 ifstream sjdbInfo((genDir+"/sjdbInfo.txt").c_str());
479 if (sjdbInfo.fail()) {
480 ostringstream errOut;
481 errOut << "EXITING because of FATAL error, could not open file " << (genDir+"/sjdbInfo.txt") <<"\n";
482 errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <<flush;
483 exitWithError(errOut.str(),std::cerr, P.inOut->logMain, EXIT_CODE_INPUT_FILES, P);
484 };
485
486
487 sjdbInfo >> sjdbN >> pGe.sjdbOverhang;
488 P.inOut->logMain << "Processing splice junctions database sjdbN=" <<sjdbN<<", pGe.sjdbOverhang=" <<pGe.sjdbOverhang <<" \n";
489
490 sjChrStart=nChrReal;
491 sjGstart=chrStart[sjChrStart];
492
493 //fill the sj-db to genome translation array
494 sjDstart=new uint [sjdbN];
495 sjAstart=new uint [sjdbN];
496 sjdbStart=new uint [sjdbN];
497 sjdbEnd=new uint [sjdbN];
498
499 sjdbMotif=new uint8 [sjdbN];
500 sjdbShiftLeft=new uint8 [sjdbN];
501 sjdbShiftRight=new uint8 [sjdbN];
502 sjdbStrand=new uint8 [sjdbN];
503
504 for (uint ii=0;ii<sjdbN;ii++) {//get the info about junctions from sjdbInfo.txt
505 {
506 uint16 d1,d2,d3,d4;
507 sjdbInfo >> sjdbStart[ii] >> sjdbEnd[ii] >> d1 >> d2 >> d3 >> d4;
508 sjdbMotif[ii] = (uint8) d1;
509 sjdbShiftLeft[ii] = (uint8) d2;
510 sjdbShiftRight[ii] = (uint8) d3;
511 sjdbStrand[ii] = (uint8) d4;
512 };
513 sjDstart[ii] = sjdbStart[ii] - pGe.sjdbOverhang;
514 sjAstart[ii] = sjdbEnd[ii] + 1;
515 if (sjdbMotif[ii]==0) {//shinon-canonical junctions back to their true coordinates
516 sjDstart[ii] += sjdbShiftLeft[ii];
517 sjAstart[ii] += sjdbShiftLeft[ii];
518 };
519 };
520 };
521 };